Pyhton ka chilla¶

Chapter - 1 How to use jupyter notebook¶

Our basics of python¶

01-Our First Program¶

In [ ]:
# Write print and then () and inside bracket add "" and write any thing you want to print 
print("Hello World")
print(2+3)
print("Python with Ammar")
Hello World
5
Python with Ammar

02-Operators¶

In [ ]:
# Addition
print(2+3)
# Subtraction
print(3-2)
# Multiplication
print(48*3)
#Divivion for getting a floating no. like 3.25
print(22/3) #floating numbers
# Division for getting a whole no 
print(11//3) #for whole number 
# Getting a power
print(2**3)
# Getting a percentage 
print(34%2)
# Adding simultaniouly all funtion according to PEMDAS
print(2**3/2*3/3+6-4+2)

#PEMDAS
#Parenthesis Exponenets Multiply Addition Substraction
#left to right sequencefor M D & A S* 
5
1
144
7.333333333333333
3
8
0
8.0

03-Strings¶

In [ ]:
# Anything we write inside the "" is our string 
print("Hello World")
print("Python with Ammar")
print('test for single quote')
print("test for double quote")
print('''test for trriple quotes''')
print("Whats's Up")
print("  what's up    ?")
print("srtring_clear")
Hello World
Python with Ammar
test for single quote
test for double quote
test for trriple quotes
Whats's Up
  what's up    ?
srtring_clear

04-Comments¶

In [ ]:
# Adding anything to comment by pressing (Ctrl+/) 
print("How are you?") #press these to comment out (Ctrl+/)
print (" We are learning python with ammar") #print a string
print(2+3) #print operators function withnumbers
How are you?
 We are learning python with ammar
5

05-Variable¶

In [ ]:
#Variables: objects containing specific values 
x=5 #Numeric or integer variable
print (x)
y= "we are learning Python with ammar" #string variable 
print(y)
x= x+15
print(x) #answer will be 20 beacause lines are updating from upward to downward continuously

#types/class of variables
type(x)
print(type(x))
print(type(y))

#print_type_class

#rules to assign a variable 
#1 . Variable should contains letter , numbers or underscores
#2 . Do not start with numbers Like 2y instead of y only
#3 . Spaces are not allowed in variable name 
#4 . Do not use keywords used in functions like break ,  mean , media, test, etc
#5 . Short and Descriptive 
#6 . Case sensitivity (Lowercase , upper case letters , lower case letters should be used)

fruit_basket= 10
fruit_basket= "Mangoes"
print(type (fruit_basket))
#del fruit_basket
print(fruit_basket)
5
we are learning Python with ammar
20
<class 'int'>
<class 'str'>
<class 'str'>
Mangoes

06-Input Variables¶

In [ ]:
# greetings= "Assalam-u-Alikum "
# asking=",kia hal hain?"
# print(greetings,name,asking)

#3rd Stage input function
name=input("what is your name? ")
age=input("How old are you? ")
greetings="Hello"

print(greetings,name,",you are still young bro ")

#input_ammar_ You are still young 
what is your name? Muzammil 
How old are you? 18
Hello Muzammil  ,you are still young bro 

07-Conditional Logics¶

In [ ]:
#logical operators are either "true or false or yes or no or 0 or 1"
#equal to                   ==
#not equal to               !=
#less than                  <
#greater than               >
#less then and equal to     <=
#greater than and wqualto   >=

# question.. is 4 equal to 4
# print(4==4)
# print(4!=4)
# print(4>3)
# print(5<4)
# print(3<=5)
# print(3>=5)

# #application of logical operators 
# hammad_age=4
# age_at_school=5
# print(hammad_age==age_at_school)

#input function and logical operator
age_at_school= 5 #variable
hammad_age=input("What is the age of Hammad ?") #input function
hammad_age=int(hammad_age) #changing type of variavble
print(type(hammad_age))
print(hammad_age==age_at_school) #logical operator

#convert input 
What is the age of Hammad ?5
<class 'int'>
True

08-Type Conversion¶

In [ ]:
# x=10        #integer
# y=10.2      #float
# z="hello"   #string

# print(type(x))
# print(type(y))
# print(type(z))

#implicit type conversion
# x=x*y

# print(x, type(x))

#explicit type conversion
# age=input("what is your age? ")
# print(type(float(age)))
# # age=int(age)
# # print(type(int(age)))
# print(age,type(str(age)))

name=input("what is your name ?")
print(name, type(str(name)))

#type_conversion
what is your name ?Muzammil
Muzammil <class 'str'>

09-if, else and elif¶

In [ ]:
required_age_at_school=4
hammad_age=1

#question: Can hammad goto school 
if hammad_age==required_age_at_school:
    print("Congratulation!! Hammad can join the school")
elif hammad_age > required_age_at_school :
    print("Hammad should join higher school")
elif hammad_age<=2:
    print("You should take care of Hammad he is still a baby ")
else:
    print("Hammad Can not go to school ")

#i, elif, else statement clear .
You should take care of Hammad he is still a baby 

10-Functions¶

In [ ]:
#1

#defining a functions
# def print_codanics():
#      print("We are learning with ammar")
#      print("We are learning with ammar")
#      print("We are learning with ammar")
    
# print_codanics()

#2
# def print_code():
#     text= "we are learning python with ammar "
#     print(text)
#     print(text)
#     print(text)

# print_code()

#3
# def print_code(text):
#     print(text)
#     print(text)
#     print(text)

# print_code("We are learning python")

#4
#defining a function with if elif and else statement

# def school_calculator(age):
#     if age==5:
#         print("Hammad can join the school")
#     elif age>5:
#         print("Hammad should go to higher school")
#     else:
#         print("Hammad is still a baby")

# school_calculator(5)

#defining a function of future 
# def future_age(age):
#     new_age= age+20
#     return new_age
#     print(new_age)
#     # print(new_age)
# furture_age=future_age(3)
# print(furture_age)

#i understand functions really well

# def repeat_ali_4times():
#     text= ("ALI")
#     print(text)
#     print(text)
#     print(text)
#     print(text)

# repeat_ali_4times()


#practice again 
# text= input("what do you want to write 5 times")
# def write_4_times(text):
#     print(text)
#     print(text)
#     print(text)
#     print(text)

# write_4_times(text)
name=input("What is your name? ")
age=int(input("What is your age ? "))
greetings= ("Hello")

def school_extrance_calculator(age):
    print(greetings,name)
    if age>=5 and age<9:
        print("You are welcome to school")
    elif age<5:
        print("You are not eligible")
    elif age>=10 and age<15:
        print("You should go to higher school")
    else:
        print("you should go to university")

school_extrance_calculator(age)
What is your name? Muzammil
What is your age ? 8
Hello Muzammil
You are welcome to school

11-Loops¶

In [ ]:
# #while loops and for loops 
# # while loops 

# x=0
# while(x<=5):
#     print (x)
#     x=x+1


#for loop
# for x in range (4,11):
#     print(x)


#array 
days= ["mon", "tue", "wed", "thurs", "fri", "sat", "sund"]
for d in days:
    if d=="fri": break #Stop the loop
    # if d=="fri": continue #skip that entity 
    print(d)
mon
tue
wed
thurs

12-Import Library¶

In [ ]:
#if you want to print the value of pi
import math
print("The value of pi is ",math.pi)
print(type(math.pi))
The value of pi is  3.141592653589793
<class 'float'>
In [ ]:
import statistics
x= [150, 250,350,450]
print(statistics.mean(x))
#some important libraries
#numpy, pandas
The value of pi is  3.141592653589793
<class 'float'>
300

13-TroubleShooting¶

In [ ]:
#print(We are learning) #syntax error
#print(25/0) #runtime error

name="ammar"
print("Hello", name)

#trouble shooting is easy
Hello ammar

14-Practice¶

In [ ]:
name= input("What is your name ? ")
print(name)
age= int(input("What is your age? "))
print(age)
print(type(age))

if age==24:
    print(name,"You are still young bro")
elif age<24:
    print(name, "Bachay you are still a baby")
elif age>24 and age<100:
    print(name ,"saab Babay ho rhe ho , shadi krwa lo")
elif age>=100 and age<200:
    print(name, "Mar ja bhai ") 
else:
    print("Tu mar chuka hai ")
What is your name ? Muzammil
Muzammil
What is your age? 24
24
<class 'int'>
Muzammil You are still young bro

BMI Calculator¶

In [ ]:
#units of BMI is wiegh in kg / height in m and its square 
name= input("What is your name ?  ")
greetings= ("Hello" ,name)
greetings
weight= float(input("what is your weight?  "))
height= float(input("and your height ?  "))


bmi= weight/height**2
bmi

print(name ,"your BMI IS", bmi)
What is your name ?  Muzammil 
what is your weight?  100
and your height ?  150
Muzammil  your BMI IS 0.0044444444444444444

Chapter 2- Basics¶

Indexing¶

In [ ]:
#make a string
a= "Samosa Pakora"
a
Out[ ]:
'Samosa Pakora'
In [ ]:
#checking the value at index 0
a[0]
#counting will star from 0 to onwards in python 
Out[ ]:
'S'
In [ ]:
a[1]
Out[ ]:
'a'
In [ ]:
a[2]
Out[ ]:
'm'
In [ ]:
a[6]
#it will print a space
Out[ ]:
' '
In [ ]:
len(a)
#it will show the number of index in our string
Out[ ]:
13
In [ ]:
a[0:6]
#the last no will be excludes like i ask for letters from 0 to 6 but it will print from 0 to 5 letters 
Out[ ]:
'Samosa'
In [ ]:
a[1:8]
#here P is 7th character as we strat counting from 0 
Out[ ]:
'amosa P'
In [ ]:
a[0:13] #here if we count from 0 to 13 it will be total 14 characters here 13 no is exclusive and count complterd from 0 to 12 total as 13 
Out[ ]:
'Samosa Pakora'
In [ ]:
a[-2]
#here it starts from right side and will start from number (-1)
Out[ ]:
'r'
In [ ]:
a[-1:-6]

#here it will not print any thing 
Out[ ]:
''
In [ ]:
a[-6:-1]
#here we see the writing sequence in string will remain same from right to left
#also -1 no is "a" but it will not print here as last no is exclude
Out[ ]:
'Pakor'
In [ ]:
a[-6:0]
Out[ ]:
''
In [ ]:
a[-6:13]
Out[ ]:
'Pakora'
In [ ]:
food= "birYani"
food
Out[ ]:
'birYani'

String Methods¶

In [ ]:
food
Out[ ]:
'birYani'
In [ ]:
#Checking the length 
len(food)
Out[ ]:
7
In [ ]:
# Capitalize 
food.capitalize()
Out[ ]:
'Biryani'
In [ ]:
#Upper case letters
food.upper()
Out[ ]:
'BIRYANI'
In [ ]:
#lower case letters
food.lower()
Out[ ]:
'biryani'
In [ ]:
#replace
food.replace("b", "sh")
Out[ ]:
'shirYani'
In [ ]:
#counting a specific alphabet in a string 
name = "baba_aammar with Dr aamar tufail"
name
Out[ ]:
'baba_aammar with Dr aamar tufail'
In [ ]:
name.count("a")
Out[ ]:
9
In [ ]:
name.count("D")
Out[ ]:
1
In [ ]:
#how to find a number of index in string
name = "baba_aammar with Dr aamar tufail"
name
Out[ ]:
'baba_aammar with Dr aamar tufail'
In [ ]:
name.find("t")
Out[ ]:
14
In [ ]:
# how to split a string
food = "i love samosa , pakora , raita, biryani and karahi"
food
Out[ ]:
'i love samosa , pakora , raita, biryani and karahi'
In [ ]:
food.split(",")
Out[ ]:
['i love samosa ', ' pakora ', ' raita', ' biryani and karahi']

Basic data Structure in Python¶

1-Tuple¶

2-List¶

3-Dictionaries¶

4-Set¶

Tuple¶

  • Ordered collection of elements
  • eclosed in () round braces/ paranthesis
  • Different kind of elements can be stored (elements like int, float , string, boolean{true, false})
  • Once elements are stored you can not change or replace them (Unmutatable)
In [ ]:
tup1 = (1,"python" , True , 2.5)
tup1
Out[ ]:
(1, 'python', True, 2.5)
In [ ]:
#type of a tuple 
type(tup1)
Out[ ]:
tuple

-indexing in tuple¶

In [ ]:
tup1[1]
Out[ ]:
'python'
In [ ]:
tup1[0]
Out[ ]:
1
In [ ]:
tup1[0:6]
Out[ ]:
(1, 'python', True, 2.5)
In [ ]:
tup1[0:3] # last element is exclusive
Out[ ]:
(1, 'python', True)
In [ ]:
#length of tuple
len(tup1)
Out[ ]:
4
In [ ]:
tup2 = (2, "baba ammar", 3.5, False )
tup2
Out[ ]:
(2, 'baba ammar', 3.5, False)
In [ ]:
# concatinate ( TO add two or more tuple)
tup1+tup2
Out[ ]:
(1, 'python', True, 2.5, 2, 'baba ammar', 3.5, False)
In [ ]:
#concatinate + repeat 
tup1*3 + tup2
Out[ ]:
(1,
 'python',
 True,
 2.5,
 1,
 'python',
 True,
 2.5,
 1,
 'python',
 True,
 2.5,
 2,
 'baba ammar',
 3.5,
 False)
In [ ]:
tup1*2 + tup2
Out[ ]:
(1, 'python', True, 2.5, 1, 'python', True, 2.5, 2, 'baba ammar', 3.5, False)
In [ ]:
tup3 = (20, 50, 60, 80, 96)
tup3
Out[ ]:
(20, 50, 60, 80, 96)
In [ ]:
max(tup3)
Out[ ]:
96
In [ ]:
min(tup3)
Out[ ]:
20
In [ ]:
tup3*2
Out[ ]:
(20, 50, 60, 80, 96, 20, 50, 60, 80, 96)

mark down it and add three dashes - will creat space line

List¶

  • ordered collection of elements
  • enclosed in [] square barckets
  • Mutateable, you can change the values
In [ ]:
list1 = [2, "baba ammar" , False]
list1
Out[ ]:
[2, 'baba ammar', False]
In [ ]:
type(list1)
Out[ ]:
list
In [ ]:
len(list1)
Out[ ]:
3
In [ ]:
list1[2]
Out[ ]:
False
In [ ]:
list2 = [3, 5, "Aammar", "Codanics", 478, 53.2, True]
list2
Out[ ]:
[3, 5, 'Aammar', 'Codanics', 478, 53.2, True]
In [ ]:
list1 + list2
Out[ ]:
[2, 'baba ammar', False, 3, 5, 'Aammar', 'Codanics', 478, 53.2, True]
In [ ]:
list1*2
Out[ ]:
[2, 'baba ammar', False, 2, 'baba ammar', False]
In [ ]:
list1
Out[ ]:
[2, 'baba ammar', False]
In [ ]:
list1.reverse()
list1
Out[ ]:
[False, 'baba ammar', 2]
In [ ]:
list1.append("codanics youtube channel")
list1
Out[ ]:
[False, 'baba ammar', 2, 'codanics youtube channel']
In [ ]:
list1.count(False)
Out[ ]:
1
In [ ]:
list3 = [20,30,40,50,60,52,562,488,2485]
list3
Out[ ]:
[20, 30, 40, 50, 60, 52, 562, 488, 2485]
In [ ]:
len(list3)
Out[ ]:
9
In [ ]:
#sorting a List
list3.sort()
list3
Out[ ]:
[20, 30, 40, 50, 52, 60, 488, 562, 2485]
In [ ]:
#repeat
list3*3
Out[ ]:
[20,
 30,
 40,
 50,
 52,
 60,
 488,
 562,
 2485,
 20,
 30,
 40,
 50,
 52,
 60,
 488,
 562,
 2485,
 20,
 30,
 40,
 50,
 52,
 60,
 488,
 562,
 2485]
In [ ]:
list2+list3
Out[ ]:
[3,
 5,
 'Aammar',
 'Codanics',
 478,
 53.2,
 True,
 20,
 30,
 40,
 50,
 52,
 60,
 488,
 562,
 2485]
In [ ]:
lists= list1 +list2
lists
Out[ ]:
[False,
 'baba ammar',
 2,
 'codanics youtube channel',
 3,
 5,
 'Aammar',
 'Codanics',
 478,
 53.2,
 True]

3- Dictionaries¶

  • AN unirdered collection of element
  • Key and value
  • Curly braces/ braces {}
  • Mutateable , you can change the value
In [ ]:
#Food and thier prices 
food1= {"Samosa" : 30, "Pakora" : 100, "Raita" : 20, "Salad" : 50, "Chicken Rolls": 30,}
food1
Out[ ]:
{'Samosa': 30, 'Pakora': 100, 'Raita': 20, 'Salad': 50, 'Chicken Rolls': 30}
In [ ]:
type(food1)
Out[ ]:
dict
In [ ]:
#extract data 
keys= food1.keys()
keys
Out[ ]:
dict_keys(['Samosa', 'Pakora', 'Raita', 'Salad', 'Chicken Rolls'])
In [ ]:
values = food1.values()
values
Out[ ]:
dict_values([30, 100, 20, 50, 30])
In [ ]:
#adding new element
food1["Tikki"]=10
food1
Out[ ]:
{'Samosa': 30,
 'Pakora': 100,
 'Raita': 20,
 'Salad': 50,
 'Chicken Rolls': 30,
 'Tikki': 10}
In [ ]:
#updating a values
food1["Tikki"]= 15
food1
Out[ ]:
{'Samosa': 30,
 'Pakora': 100,
 'Raita': 20,
 'Salad': 50,
 'Chicken Rolls': 30,
 'Tikki': 15}
In [ ]:
food2 = {"Dates": 50, "Chocolates":200, "Sawayyan":1000}
food2
Out[ ]:
{'Dates': 50, 'Chocolates': 200, 'Sawayyan': 1000}
In [ ]:
#Concatinate
food1.update(food2)
food1
Out[ ]:
{'Samosa': 30,
 'Pakora': 100,
 'Raita': 20,
 'Salad': 50,
 'Chicken Rolls': 30,
 'Tikki': 15,
 'Dates': 50,
 'Chocolates': 200,
 'Sawayyan': 1000}

4-Sets¶

  • An unordered and un-indexed
  • Curly braces {} are used
  • no duplicates allowed
In [ ]:
s1= {1, 2, 2.2, 5, "Codanics", "Faisalabad", True}
s1
# here we see that boolean operator dosent print in sets 
Out[ ]:
{1, 2, 2.2, 5, 'Codanics', 'Faisalabad'}
In [ ]:
s1.add("codanics")
s1
Out[ ]:
{1, 2, 2.2, 5, 'Codanics', 'Faisalabad', 'codanics'}
In [ ]:
s1.add("Faisalabad")
s1
Out[ ]:
{1, 2, 2.2, 5, 'Codanics', 'Faisalabad', 'codanics'}
In [ ]:
s1.remove("codanics")
s1
Out[ ]:
{1, 2, 2.2, 5, 'Codanics', 'Faisalabad'}

Chapter 3¶

Graphs¶

First graph¶

In [ ]:
import seaborn as sns
import matplotlib.pyplot as plt 
sns.set_theme(style="ticks", color_codes= True)
titanic = sns.load_dataset("titanic")
sns.catplot(x="sex", y="survived", hue="class", kind="bar", data=titanic)
Out[ ]:
<seaborn.axisgrid.FacetGrid at 0x14152a7df40>

Second Graph¶

In [ ]:
import seaborn as sns
import matplotlib.pyplot as plt 
sns.set_theme(style="ticks", color_codes=True)
In [ ]:
titanic= sns.load_dataset("titanic")
titanic
Out[ ]:
survived pclass sex age sibsp parch fare embarked class who adult_male deck embark_town alive alone
0 0 3 male 22.0 1 0 7.2500 S Third man True NaN Southampton no False
1 1 1 female 38.0 1 0 71.2833 C First woman False C Cherbourg yes False
2 1 3 female 26.0 0 0 7.9250 S Third woman False NaN Southampton yes True
3 1 1 female 35.0 1 0 53.1000 S First woman False C Southampton yes False
4 0 3 male 35.0 0 0 8.0500 S Third man True NaN Southampton no True
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
886 0 2 male 27.0 0 0 13.0000 S Second man True NaN Southampton no True
887 1 1 female 19.0 0 0 30.0000 S First woman False B Southampton yes True
888 0 3 female NaN 1 2 23.4500 S Third woman False NaN Southampton no False
889 1 1 male 26.0 0 0 30.0000 C First man True C Cherbourg yes True
890 0 3 male 32.0 0 0 7.7500 Q Third man True NaN Queenstown no True

891 rows × 15 columns

In [ ]:
p1=sns.countplot(x="who", data=titanic, hue="alone")
p1.set_title("PLot for Counting")
Out[ ]:
Text(0.5, 1.0, 'PLot for Counting')
In [ ]:
#scatter plot 
import seaborn as sns
import matplotlib.pyplot as plt 
In [ ]:
sns.set_theme(style="ticks", color_codes=True)
titanic = sns.load_dataset("titanic")
g=sns.FacetGrid(titanic, row="sex", hue= "alone")
g=(g.map(plt.scatter,"age", "fare").add_legend())

01_Line Plot¶

Impot library¶

  • Seaborn (automatically install these libraries

  • numpy

  • scipy
  • pandas
  • Matplotlib
In [ ]:
import seaborn as sns
import matplotlib.pyplot as plt 

#load data set 

phool = sns.load_dataset("iris")
phool

#draw a line plot
# sns.lineplotin(x="", y="", data=phool)
#lie plot always work betwwen numeric values 
Out[ ]:
sepal_length sepal_width petal_length petal_width species
0 5.1 3.5 1.4 0.2 setosa
1 4.9 3.0 1.4 0.2 setosa
2 4.7 3.2 1.3 0.2 setosa
3 4.6 3.1 1.5 0.2 setosa
4 5.0 3.6 1.4 0.2 setosa
... ... ... ... ... ...
145 6.7 3.0 5.2 2.3 virginica
146 6.3 2.5 5.0 1.9 virginica
147 6.5 3.0 5.2 2.0 virginica
148 6.2 3.4 5.4 2.3 virginica
149 5.9 3.0 5.1 1.8 virginica

150 rows × 5 columns

In [ ]:
sns.lineplot(x="sepal_length", y="sepal_width", data=phool)

plt.show()
In [ ]:
#Program with title 

import seaborn as sns
import matplotlib.pyplot as plt 

phool = sns.load_dataset("iris")
sns.lineplot(x="sepal_length", y="sepal_width", data=phool)
plt.title("Phoolo ka Plot")
plt.show()

Adding limits¶

In [ ]:
import seaborn as sns
import matplotlib.pyplot as plt 

phool = sns.load_dataset("iris")
sns.lineplot(x="sepal_length", y="sepal_width", data=phool)
plt.title("Phoolo ka Plot")
plt.xlim(2)
plt.ylim(1)
plt.show()

Set style¶

  • darkgrid
  • whitegrid
  • dark
  • white
  • ticks
In [ ]:
#to remove alreay or default style first 
sns.set_style(style= None , rc=None )
In [ ]:
import seaborn as sns
import matplotlib.pyplot as plt 

phool = sns.load_dataset("iris")
sns.lineplot(x="sepal_length", y="sepal_width", data=phool)
plt.title("Phoolo ka Plot")
#sns.set_style(style= None , rc=None)
sns.set_style("darkgrid")
plt.xlim(2)
plt.ylim(1)
plt.show()

Size of figure¶

In [ ]:
import seaborn as sns
import matplotlib.pyplot as plt 

phool = sns.load_dataset("iris")
#after loading data set 
#changing size of final figure 
plt.figure(figsize=(12,10))
sns.lineplot(x="sepal_length", y="sepal_width", data=phool)
plt.title("Phoolo ka Plot")
plt.show()

Line Plot assignment 1¶

In [ ]:
import seaborn as sns
import matplotlib.pyplot as plt 
phool = sns.load_dataset("iris")
plt.figure(figsize=(12,10))
sns.lineplot(x="sepal_length", y="sepal_width", data=phool)
plt.title("Phoolo ka Plot")
sns.set_style(style= None , rc=None)
sns.set_style("darkgrid")
plt.xlim(3)
plt.ylim(1.5)
plt.show()

02_Bar Plot¶

In [ ]:
import seaborn as sns
import matplotlib.pyplot as plt 

phool = sns.load_dataset("iris")
phool 

sns.barplot(x="species", y="sepal_width", data=phool)
plt.title("Phoolo ka Plot")
plt.show()
In [ ]:
phool
Out[ ]:
sepal_length sepal_width petal_length petal_width species
0 5.1 3.5 1.4 0.2 setosa
1 4.9 3.0 1.4 0.2 setosa
2 4.7 3.2 1.3 0.2 setosa
3 4.6 3.1 1.5 0.2 setosa
4 5.0 3.6 1.4 0.2 setosa
... ... ... ... ... ...
145 6.7 3.0 5.2 2.3 virginica
146 6.3 2.5 5.0 1.9 virginica
147 6.5 3.0 5.2 2.0 virginica
148 6.2 3.4 5.4 2.3 virginica
149 5.9 3.0 5.1 1.8 virginica

150 rows × 5 columns

In [ ]:
import seaborn as sns
import matplotlib.pyplot as plt 

phool = sns.load_dataset("iris")
phool 

sns.barplot(x="species", y="petal_length", data=phool)
plt.title("Phoolo ka Plot")
plt.show()
In [ ]:
import seaborn as sns
import matplotlib.pyplot as plt 

kashti = sns.load_dataset("titanic")
kashti
Out[ ]:
survived pclass sex age sibsp parch fare embarked class who adult_male deck embark_town alive alone
0 0 3 male 22.0 1 0 7.2500 S Third man True NaN Southampton no False
1 1 1 female 38.0 1 0 71.2833 C First woman False C Cherbourg yes False
2 1 3 female 26.0 0 0 7.9250 S Third woman False NaN Southampton yes True
3 1 1 female 35.0 1 0 53.1000 S First woman False C Southampton yes False
4 0 3 male 35.0 0 0 8.0500 S Third man True NaN Southampton no True
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
886 0 2 male 27.0 0 0 13.0000 S Second man True NaN Southampton no True
887 1 1 female 19.0 0 0 30.0000 S First woman False B Southampton yes True
888 0 3 female NaN 1 2 23.4500 S Third woman False NaN Southampton no False
889 1 1 male 26.0 0 0 30.0000 C First man True C Cherbourg yes True
890 0 3 male 32.0 0 0 7.7500 Q Third man True NaN Queenstown no True

891 rows × 15 columns

In [ ]:
import seaborn as sns
import matplotlib.pyplot as plt 

kashti = sns.load_dataset("titanic")
sns.barplot(x="who",y="alone", data=kashti)
plt.title("titanic ka Plot")
plt.show()
In [ ]:
import seaborn as sns
import matplotlib.pyplot as plt 

kashti = sns.load_dataset("titanic")
sns.barplot(x="sex",y="alone", hue="who", data=kashti)
plt.title("titanic ka Plot")
plt.show()
In [ ]:
##  order setting 
import seaborn as sns
import matplotlib.pyplot as plt 

kashti = sns.load_dataset("titanic")
sns.barplot(x="sex",y="alone", hue="who", data=kashti, order=["female" ,"male"] )
plt.title("titanic ka Plot")
plt.show()
In [ ]:
#changing color 
##  order setting 
import seaborn as sns
import matplotlib.pyplot as plt 

kashti = sns.load_dataset("titanic")
sns.barplot(x="sex",y="alone", hue="who", data=kashti, order=["female" ,"male"] ,color= "grey")
plt.title("titanic ka Plot")
plt.show()
In [ ]:
#removing error bars
import seaborn as sns
import matplotlib.pyplot as plt 

kashti = sns.load_dataset("titanic")
sns.barplot(x="sex",y="alone", hue="who", data=kashti, order=["female" ,"male"] ,ci= None)
plt.title("titanic ka Plot")
plt.show()
In [ ]:
#using different paletts
import seaborn as sns
import matplotlib.pyplot as plt 

kashti = sns.load_dataset("titanic")
sns.barplot(x="sex",y="alone", hue="who", data=kashti, order=["female" ,"male"] ,ci= None , palette= "pastel")
plt.title("titanic ka Plot")
plt.show()

#we can search for seaborne built in color palettes
In [ ]:
# intensity of colors 
import seaborn as sns
import matplotlib.pyplot as plt 

kashti = sns.load_dataset("titanic")
sns.barplot(x="sex",y="alone", hue="who", data=kashti, order=["female" ,"male"] ,ci= None , saturation=.5)
plt.title("titanic ka Plot")
plt.show()
In [ ]:
#HOrizontal Plot 
#numeric parameter on x axis 
import seaborn as sns
import matplotlib.pyplot as plt 

kashti = sns.load_dataset("titanic")
sns.barplot(x="fare", y="class", hue="sex", data=kashti ,ci= None)
plt.title("titanic ka Plot")
plt.show()
In [ ]:
import seaborn as sns
import matplotlib.pyplot as plt 

kashti = sns.load_dataset("titanic")
sns.barplot(x="fare", y="class", hue="sex", data=kashti ,ci= None)
plt.title("titanic ka Plot")
plt.show()
In [ ]:
import seaborn as sns
import matplotlib.pyplot as plt 

kashti = sns.load_dataset("titanic")
sns.barplot(x="class", y="fare", data=kashti ,linewidth=2.5,facecolor=(1,1,1,1) , errcolor= "0.5" , edgecolor= "0.5" )

plt.title("titanic ka Plot")
plt.show()
In [ ]:
import seaborn as sns 
import matplotlib.pyplot as plt 
#canvas (baloon board)
sns.set(style="whitegrid")
kashti= sns.load_dataset("titanic")

sns.boxplot(x="class",y="fare", data=kashti)
plt.show()
In [ ]:
import seaborn as sns 
import matplotlib.pyplot as plt 
sns.set(style="whitegrid")
tip= sns.load_dataset("tips")
tip
Out[ ]:
total_bill tip sex smoker day time size
0 16.99 1.01 Female No Sun Dinner 2
1 10.34 1.66 Male No Sun Dinner 3
2 21.01 3.50 Male No Sun Dinner 3
3 23.68 3.31 Male No Sun Dinner 2
4 24.59 3.61 Female No Sun Dinner 4
... ... ... ... ... ... ... ...
239 29.03 5.92 Male No Sat Dinner 3
240 27.18 2.00 Female Yes Sat Dinner 2
241 22.67 2.00 Male Yes Sat Dinner 2
242 17.82 1.75 Male No Sat Dinner 2
243 18.78 3.00 Female No Thur Dinner 2

244 rows × 7 columns

In [ ]:
import seaborn as sns 
import matplotlib.pyplot as plt 
import numpy #estimator will not work here
sns.set(style="whitegrid")
tip= sns.load_dataset("tips")

sns.boxplot(x="day",y="tip", data=tip, saturation=1)
plt.show()
In [ ]:
import seaborn as sns 
import matplotlib.pyplot as plt
import pandas as pd 
import numpy as np

sns.set(style="whitegrid")
tip= sns.load_dataset("tips")
tip.describe()
Out[ ]:
total_bill tip size
count 244.000000 244.000000 244.000000
mean 19.785943 2.998279 2.569672
std 8.902412 1.383638 0.951100
min 3.070000 1.000000 1.000000
25% 13.347500 2.000000 2.000000
50% 17.795000 2.900000 2.000000
75% 24.127500 3.562500 3.000000
max 50.810000 10.000000 6.000000
In [ ]:
#  catagorical variable must be draw in x-axis or hue
#     numeric variable in y-axis
#     numerical variable should not be go to hue
In [ ]:
#for single 

import seaborn as sns
sns.set(style="whitegrid")
tip= sns.load_dataset("tips")

sns.boxplot(x=tip["tip"])
Out[ ]:
<AxesSubplot:xlabel='tip'>
In [ ]:
import seaborn as sns
sns.set(style="whitegrid")
tip= sns.load_dataset("tips")
sns.boxplot(y=tip["total_bill"])
Out[ ]:
<AxesSubplot:ylabel='total_bill'>
In [ ]:
import seaborn as sns
sns.set(style="whitegrid")
tip= sns.load_dataset("tips")
sns.boxplot(x="tip", y="day", data= tip)
Out[ ]:
<AxesSubplot:xlabel='tip', ylabel='day'>
In [ ]:
import seaborn as sns
sns.set(style="whitegrid")
tip= sns.load_dataset("tips")
sns.boxplot(x="tip", y="day", hue="smoker" ,data= tip, palette ="Set2")
Out[ ]:
<AxesSubplot:xlabel='tip', ylabel='day'>
In [ ]:
#dodge
import seaborn as sns
sns.set(style="whitegrid")
tip= sns.load_dataset("tips")
sns.boxplot(x="tip", y="day", hue="smoker" ,data= tip, palette ="Set2", dodge=True)
Out[ ]:
<AxesSubplot:xlabel='tip', ylabel='day'>
In [ ]:
import seaborn as sns
sns.set(style="whitegrid")
tip= sns.load_dataset("tips")
sns.boxplot(x="tip", y="day", hue="smoker" ,data= tip, palette ="Set2", dodge=False)
Out[ ]:
<AxesSubplot:xlabel='tip', ylabel='day'>
In [ ]:
import seaborn as sns 
import matplotlib.pyplot as plt 
import numpy
sns.set(style="whitegrid")
tip= sns.load_dataset("tips")
my_color= {"Yes":"#0d9ea8","No":"#6e0c11"}

sns.boxplot(x="tip",y="day", hue="smoker", data=tip, saturation=1 ,palette=my_color )
plt.show()
In [ ]:
import seaborn as sns
import matplotlib.pyplot as plt 
import pandas as pd 
import numpy as np

kashti= sns.load_dataset("titanic")
kashti.head()
Out[ ]:
survived pclass sex age sibsp parch fare embarked class who adult_male deck embark_town alive alone
0 0 3 male 22.0 1 0 7.2500 S Third man True NaN Southampton no False
1 1 1 female 38.0 1 0 71.2833 C First woman False C Cherbourg yes False
2 1 3 female 26.0 0 0 7.9250 S Third woman False NaN Southampton yes True
3 1 1 female 35.0 1 0 53.1000 S First woman False C Southampton yes False
4 0 3 male 35.0 0 0 8.0500 S Third man True NaN Southampton no True
In [ ]:
import seaborn as sns
import matplotlib.pyplot as plt 
import pandas as pd 
import numpy as np

kashti= sns.load_dataset("titanic")
sns.boxplot(x="survived", y="age", data= kashti)
Out[ ]:
<AxesSubplot:xlabel='survived', ylabel='age'>
In [ ]:
kashti= sns.load_dataset("titanic")
p1 = sns.boxplot(x="survived", y="age", showmeans=True, data= kashti)
p1
Out[ ]:
<AxesSubplot:xlabel='survived', ylabel='age'>
In [ ]:
kashti= sns.load_dataset("titanic")
p1 = sns.boxplot(x="survived", y="age", showmeans=True, meanprops={"marker":"+", "markersize":"12", "markeredgecolor":"red"} ,data= kashti)
p1
Out[ ]:
<AxesSubplot:xlabel='survived', ylabel='age'>
In [ ]:
import matplotlib.pyplot as plt
import seaborn as sns 
import numpy as np
import pandas as pd

kashti= sns.load_dataset("titanic")
sns.boxplot(x="survived", y="age", showmeans=True, meanprops={"marker":"+", "markersize":"12", "markeredgecolor":"red"} ,data= kashti)
plt.title("Kitne doobay or kitne bachay" , size= 25, weight="bold")
plt.xlabel ("Kitnay bach gaey ", size= 16)
plt.ylabel ("umar kia hai", size = 16   )
Out[ ]:
Text(0, 0.5, 'umar kia hai')

BoxPlot_ Assignment 2¶

In [ ]:
#imporing libraries
import seaborn as sns 
import matplotlib.pyplot as plt 
import pandas
#setting canvas 
sns.set(style="darkgrid")
sns.set_style(style= None , rc=None)
#loading dataset 
tip= sns.load_dataset("tips")
#adjusting size of my final graph
plt.figure(figsize=(12,12))
#making our palette for diffining colors for each hue from hex color picker 
my_color= {"Yes":"#0d9ea8","No":"#6e0c11"}
#plotting a graph 
sns.boxplot(x="tip",y="day", hue="smoker", data=tip, saturation=1 ,palette=my_color )
plt.show()

Assignment with new data set¶

In [ ]:
import seaborn as sns 
import matplotlib.pyplot as plt 
import pandas as pd
import numpy as np
#importing data set 
my_data= pd.read_csv("new_data.csv")
my_data.head()
Out[ ]:
Gender Location Age_range Qualification field_of_study Purpose_for_chilla Work_status Blood_group Mobile_sim Sim_type ... Your favorite programming language? Marital Status? Are you Vaccinated? Where do you live? Working experience Age Weight_in_kg Height_in_cm How many hours you code a day? (int) Light kitni der band hti hy? int
0 Male Pakistan 36-40 Masters Natural Sciences to boost my skill set Unemplyed B+ U-fone Prepaid ... Python Yes Yes Urbun 5.0 38.00 77.0 179.000 3.0 2
1 Male Pakistan 26-30 Bachelors CS/IT to boost my skill set Student B+ U-fone Prepaid ... Python No Yes Urbun 1.0 25.00 53.6 178.000 2.0 6
2 Male Pakistan 31-35 Masters Enginnering Switch my field of study Employed B+ Zong Prepaid ... Python Yes Yes Urbun 5.5 31.34 93.0 173.000 2.0 0
3 Female Pakistan 31-35 Masters CS/IT to boost my skill set Employed O+ U-fone Postpaid ... Python Yes Yes Urbun 5.0 33.00 60.0 157.000 3.0 24
4 Female Pakistan 26-30 Masters Enginnering to boost my skill set Student A- Mobilink Prepaid ... Javascript No Yes Rural 3.5 27.00 59.9 164.544 6.0 12

5 rows × 23 columns

In [ ]:
#importing libraries 
import seaborn as sns 
import matplotlib.pyplot as plt 
import pandas as pd
import numpy as np
#importing data set 
my_data= pd.read_csv("new_data.csv")
#setting canvas 
sns.set(style="darkgrid")
#setting final figure size 
plt.figure(figsize=(18,12))


sns.boxplot(x="Qualification", y="Weight_in_kg" ,hue="Gender", dodge= True ,data= my_data, saturation= 1, showmeans=True, meanprops={"marker":"+", "markersize":"15", "markeredgecolor":"red"})
plt.title("Kon Ziada Khata hai ?", size= 28, weight= "bold")
plt.show()
In [ ]:
import plotly.express as px
import pandas as pd 
data= pd.read_csv("new_data.csv")
fig = px.scatter(data, x="Weight_in_kg", y="Working experience", color="Gender", symbol="Marital Status?", facet_col="Qualification",
          labels={"Gender": "Sex", "Marital Status?": "Married"})
fig.show()
In [ ]:
import plotly.express as px
import pandas as pd 
data= pd.read_csv("new_data.csv")
fig = px.scatter(data, x="Age", y="Working experience", color="Gender", symbol="Marital Status?", facet_col="Qualification",
          labels={"Gender": "Sex", "Marital Status?": "Married"})
fig.show()
In [ ]:
import plotly.express as px
import pandas as pd 
data= pd.read_csv("new_data.csv")
fig = px.scatter(data, x="Weight_in_kg", y="Age", color="Location",
    size="Working experience", size_max=45, log_x=True)

fig.update_layout(legend=dict(
    orientation="h",
    yanchor="bottom",
    y=1,
    xanchor="right",
    x=1
))

fig.show()
In [ ]:
import seaborn as sns
import matplotlib.pyplot as plt
sns.set_theme(style="darkgrid")
plt.figure(figsize=(10,10))
import pandas as pd 
my_data= pd.read_csv("new_data.csv")
sns.boxenplot(x="Location", y="Weight_in_kg",
              color="b",
              scale="linear", data=my_data)

plt.title("Konsi Country k loog ziada khate hain" ,size=25 , weight="bold")
Out[ ]:
Text(0.5, 1.0, 'Konsi Country k loog ziada khate hain')
In [ ]:
import plotly.express as px
import pandas as pd 
data= pd.read_csv("new_data.csv")
fig = px.area(data, x="Age", y="Weight_in_kg", color="Qualification", line_group="Location")
fig.show()
In [ ]:
import plotly.express as px
import pandas as pd 
data= pd.read_csv("new_data.csv")

fig = px.sunburst(data, path=['Qualification', 'Location'], values='Age',
                  color='Working experience', hover_data=['Marital Status?'])
fig.show()
In [ ]:
import plotly.express as px
import pandas as pd 
data= pd.read_csv("new_data.csv")
fig = px.violin(data, y="Age", x="Marital Status?", color="Gender", box=True, points="all", hover_data=data.columns)
fig.show()
In [ ]:
import plotly.express as px
import pandas as pd 
data= pd.read_csv("new_data.csv")

fig = px.density_contour(data, x="Age", y="Weight_in_kg")
fig.show()
In [ ]:
import plotly.express as px
import pandas as pd 
data= pd.read_csv("new_data.csv")

fig = px.line_polar(data, r="Age", theta="Location", color="Qualification", line_close=True,
            color_discrete_sequence=px.colors.sequential.Plasma_r)
fig.show()
In [ ]:
import plotly.express as px
import pandas as pd 
data= pd.read_csv("new_data.csv")
fig = px.scatter(data, x="Weight_in_kg", y="Height_in_cm", color="Qualification", marginal_y="violin",
           marginal_x="box", trendline="ols", template="simple_white", labels={"Height_in_cm" : "Height", "Weight_in_kg": "Weight", "Male": "Munda" })

fig.show()

Chapter 4- Numpy¶

1-D Array¶

In [ ]:
import numpy as np
In [ ]:
a= np.array([1,2,3,4,5])
a
Out[ ]:
array([1, 2, 3, 4, 5])
In [ ]:
type(a)
Out[ ]:
numpy.ndarray
In [ ]:
len(a)
Out[ ]:
5
In [ ]:
# creating a single axis array of number zero 
c= np.zeros(2)
c
Out[ ]:
array([0., 0.])
In [ ]:
# creating a single axis array of number One 
d= np.ones(10)
d
Out[ ]:
array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])
In [ ]:
e=np.empty(3)
e
Out[ ]:
array([9.96959917e-312, 0.00000000e+000, 4.94065646e-324])
In [ ]:
# with the specific range of elements 
g= np.arange(5,15) # as we arleady know the last no. is exclusive 
g
Out[ ]:
array([ 5,  6,  7,  8,  9, 10, 11, 12, 13, 14])
In [ ]:
# with the range of elements with a speciefied gap
h=np.arange(2,20,2) # goin from 2 to 20 with a specified gap of 2 and last no. is exlcusive
h
Out[ ]:
array([ 2,  4,  6,  8, 10, 12, 14, 16, 18])
In [ ]:
# and if we want 20 no. also in last arange
h=np.arange(2,21,2)
h
Out[ ]:
array([ 2,  4,  6,  8, 10, 12, 14, 16, 18, 20])
In [ ]:
# Linearly spaced arrays 
i= np.linspace(0,15 , num= 5) # GOing 0 to 15 in just 5 numbers in a way that the distance between each no. will remain same
i
Out[ ]:
array([ 0.  ,  3.75,  7.5 , 11.25, 15.  ])
In [ ]:
j= np.ones(5, dtype=np.float64)
j
Out[ ]:
array([1., 1., 1., 1., 1.])

2-D Array¶

In [ ]:
b= np.array([[2,2,2,2],[3,3,3,3]])
b
Out[ ]:
array([[2, 2, 2, 2],
       [3, 3, 3, 3]])

2- axis¶

In b

  • First axis has a length = 2
  • Second axis has lenght = 4
In [ ]:
e= np.array([[1,1,1,1],[2,2,2,2]])
e
Out[ ]:
array([[1, 1, 1, 1],
       [2, 2, 2, 2]])
In [ ]:
k=np.zeros((3,4))
k
Out[ ]:
array([[0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.]])
In [ ]:
l=np.zeros((5,6))
l
Out[ ]:
array([[0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.]])
In [ ]:
m=np.ones((2,4))
m
Out[ ]:
array([[1., 1., 1., 1.],
       [1., 1., 1., 1.]])
In [ ]:
d= np.array([[2,2,2],[2,2,2],[6,5,4]])
d
Out[ ]:
array([[2, 2, 2],
       [2, 2, 2],
       [6, 5, 4]])
In [ ]:
f= np.array ([[4,5,6,4],[8,6,4,5],[8,6,4,2],[9,6,3,2]])
f
Out[ ]:
array([[4, 5, 6, 4],
       [8, 6, 4, 5],
       [8, 6, 4, 2],
       [9, 6, 3, 2]])
In [ ]:
g= np.array([[1,2,3],[2,3,4],[4,5,6]])
g
Out[ ]:
array([[1, 2, 3],
       [2, 3, 4],
       [4, 5, 6]])

3-D Array¶

In [ ]:
# TensorFlow is a library use for 3 Dimensional things
# TensorFlow is also a free and open source software libraryfor machine learning and artificial intelligence 
In [ ]:
#making and reshaping a 3D array
c= np.arange(24) .reshape (2,3,4) # First axis has  length = 2,,Second Axis has length = 3 , Third axis has length = 4
c
Out[ ]:
array([[[ 0,  1,  2,  3],
        [ 4,  5,  6,  7],
        [ 8,  9, 10, 11]],

       [[12, 13, 14, 15],
        [16, 17, 18, 19],
        [20, 21, 22, 23]]])
In [ ]:
d= np.zeros((2,3,3))
d
Out[ ]:
array([[[0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.]],

       [[0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.]]])
In [ ]:
f= np.ones((3,4,5) , dtype= np.int64)
f
Out[ ]:
array([[[1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1]],

       [[1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1]],

       [[1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1]]], dtype=int64)
In [ ]:
f= np.array ([[[4,5,6,4],[8,6,4,5],[8,6,4,2],[9,6,3,2]],[[4,5,6,4],[8,6,4,5],[8,6,4,2],[9,6,3,2]],[[4,5,6,4],[8,6,4,5],[8,6,4,2],[9,6,3,2]]])
f
Out[ ]:
array([[[4, 5, 6, 4],
        [8, 6, 4, 5],
        [8, 6, 4, 2],
        [9, 6, 3, 2]],

       [[4, 5, 6, 4],
        [8, 6, 4, 5],
        [8, 6, 4, 2],
        [9, 6, 3, 2]],

       [[4, 5, 6, 4],
        [8, 6, 4, 5],
        [8, 6, 4, 2],
        [9, 6, 3, 2]]])
In [ ]:
z= np.array ([[[1,2,3],[4,5,6]],[[7,8,9],[10,11,12]],[[13,14,15],[16,17,18]]])
z
Out[ ]:
array([[[ 1,  2,  3],
        [ 4,  5,  6]],

       [[ 7,  8,  9],
        [10, 11, 12]],

       [[13, 14, 15],
        [16, 17, 18]]])

Numpy Practice Session¶

In [ ]:
#importing numpy library 
import numpy as np

creating an array using numpy¶

In [ ]:
import numpy as np
food= np.array(["pakora" , "samosa" , "raita"])
food
Out[ ]:
array(['pakora', 'samosa', 'raita'], dtype='<U6')
In [ ]:
price = np.array([5,5,5])
price 
Out[ ]:
array([5, 5, 5])
In [ ]:
#Checking type of array 
type(price)
Out[ ]:
numpy.ndarray
In [ ]:
type(food)
Out[ ]:
numpy.ndarray
In [ ]:
#length of array 
len(food)
Out[ ]:
3
In [ ]:
#indexing 
price[2]
Out[ ]:
5
In [ ]:
price[0:]
Out[ ]:
array([5, 5, 5])
In [ ]:
#index no to find the index in an array 
food[1]
Out[ ]:
'samosa'
In [ ]:
price.mean()
Out[ ]:
5.0
In [ ]:
# zeros method 
a= np.zeros(6)
a
Out[ ]:
array([0., 0., 0., 0., 0., 0.])
In [ ]:
# ones method 
b= np.ones(5)
b
Out[ ]:
array([1., 1., 1., 1., 1.])
In [ ]:
c= np.empty(5)
c
Out[ ]:
array([1., 1., 1., 1., 1.])
In [ ]:
# Making a Range 
a= np.arange(10)
a
Out[ ]:
array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
In [ ]:
# Specified range 
a= np.arange(2,21)
a
Out[ ]:
array([ 2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
       19, 20])
In [ ]:
# specific arang with specific distance 
a= np.arange (2,20,3)
a
Out[ ]:
array([ 2,  5,  8, 11, 14, 17])
In [ ]:
#table of 5
a= np.arange (5,55,5)
a
Out[ ]:
array([ 5, 10, 15, 20, 25, 30, 35, 40, 45, 50])
In [ ]:
# with line space 
a= np.linspace (0,10, num = 6 ,dtype= np.int64 )
a
Out[ ]:
array([ 0,  2,  4,  6,  8, 10], dtype=int64)
In [ ]:
b= np.linspace(1,100, num = 40)
b
Out[ ]:
array([  1.        ,   3.53846154,   6.07692308,   8.61538462,
        11.15384615,  13.69230769,  16.23076923,  18.76923077,
        21.30769231,  23.84615385,  26.38461538,  28.92307692,
        31.46153846,  34.        ,  36.53846154,  39.07692308,
        41.61538462,  44.15384615,  46.69230769,  49.23076923,
        51.76923077,  54.30769231,  56.84615385,  59.38461538,
        61.92307692,  64.46153846,  67.        ,  69.53846154,
        72.07692308,  74.61538462,  77.15384615,  79.69230769,
        82.23076923,  84.76923077,  87.30769231,  89.84615385,
        92.38461538,  94.92307692,  97.46153846, 100.        ])
In [ ]:
# specifing the data type 
a= np.ones(5, dtype= np.int8)
a
Out[ ]:
array([1, 1, 1, 1, 1], dtype=int8)
In [ ]:
a= np.ones(50, dtype= np.float64)
a
Out[ ]:
array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])

Array functions¶

In [ ]:
a= np.array([ 10,12,15,2,4,6,18,100,18,16,10.3,0.5])
a
Out[ ]:
array([ 10. ,  12. ,  15. ,   2. ,   4. ,   6. ,  18. , 100. ,  18. ,
        16. ,  10.3,   0.5])
In [ ]:
#sorting an array
a.sort()
a
Out[ ]:
array([  0.5,   2. ,   4. ,   6. ,  10. ,  10.3,  12. ,  15. ,  16. ,
        18. ,  18. , 100. ])
In [ ]:
b= np.array([10.5,5,15.6,8,0.5,10.5,100.9,15,16,59])
b
Out[ ]:
array([ 10.5,   5. ,  15.6,   8. ,   0.5,  10.5, 100.9,  15. ,  16. ,
        59. ])
In [ ]:
c= np.concatenate((a,b))
c
Out[ ]:
array([  0.5,   2. ,   4. ,   6. ,  10. ,  10.3,  12. ,  15. ,  16. ,
        18. ,  18. , 100. ,  10.5,   5. ,  15.6,   8. ,   0.5,  10.5,
       100.9,  15. ,  16. ,  59. ])
In [ ]:
c.sort()
c
Out[ ]:
array([  0.5,   0.5,   2. ,   4. ,   5. ,   6. ,   8. ,  10. ,  10.3,
        10.5,  10.5,  12. ,  15. ,  15. ,  15.6,  16. ,  16. ,  18. ,
        18. ,  59. , 100. , 100.9])
In [ ]:
a = np.array ([[1,2,3],[2,6,5]])
a
Out[ ]:
array([[1, 2, 3],
       [2, 6, 5]])
In [ ]:
b = np.array ([[3,6,5],[6,8,9]])
b
Out[ ]:
array([[3, 6, 5],
       [6, 8, 9]])
In [ ]:
#checking the shape of matrix 
b.shape
Out[ ]:
(2, 3)
In [ ]:
c= np.concatenate((a,b) ,axis= 1)
c
Out[ ]:
array([[1, 2, 3, 3, 6, 5],
       [2, 6, 5, 6, 8, 9]])
In [ ]:
c= np.concatenate((a,b) ,axis= 0)
c
Out[ ]:
array([[1, 2, 3],
       [2, 6, 5],
       [3, 6, 5],
       [6, 8, 9]])
In [ ]:
c.shape
Out[ ]:
(4, 3)

# 3-D Array¶

In [ ]:
a= np.array ([[['a','b','c'],['e','d','f']],
            [['a','b','c'],['e','d','f']],
            [['a','b','c'],['e','d','f']]])
a
Out[ ]:
array([[['a', 'b', 'c'],
        ['e', 'd', 'f']],

       [['a', 'b', 'c'],
        ['e', 'd', 'f']],

       [['a', 'b', 'c'],
        ['e', 'd', 'f']]], dtype='<U1')
In [ ]:
#finding a no. of dimensions 
a.ndim
Out[ ]:
3
In [ ]:
a.size
Out[ ]:
18
In [ ]:
# shape of array 
a.shape 
Out[ ]:
(3, 2, 3)
In [ ]:
b= np.array ([[[1,2,3],[7,8,9],[9,6,3]],
            [[1,2,3],[7,8,9],[9,6,3]],
            [[1,2,3],[7,8,9],[9,6,3]]])
b
Out[ ]:
array([[[1, 2, 3],
        [7, 8, 9],
        [9, 6, 3]],

       [[1, 2, 3],
        [7, 8, 9],
        [9, 6, 3]],

       [[1, 2, 3],
        [7, 8, 9],
        [9, 6, 3]]])
In [ ]:
b.ndim
Out[ ]:
3
In [ ]:
type(a)
Out[ ]:
numpy.ndarray
In [ ]:
b.shape
Out[ ]:
(3, 3, 3)
In [ ]:
b.size
Out[ ]:
27

converting 1d to 2d¶

In [ ]:
a= np.arange(9)
a
Out[ ]:
array([0, 1, 2, 3, 4, 5, 6, 7, 8])
In [ ]:
a.reshape (3,3) #3*3=9 (9 indexes are there in array )
Out[ ]:
array([[0, 1, 2],
       [3, 4, 5],
       [6, 7, 8]])
In [ ]:
a.shape
Out[ ]:
(9,)
In [ ]:
#row wise conversion
b= a[np.newaxis,:]
b
Out[ ]:
array([[0, 1, 2, 3, 4, 5, 6, 7, 8]])
In [ ]:
b.shape
Out[ ]:
(1, 9)
In [ ]:
#coloumn wise conversion
b= a[:, np.newaxis]
b
Out[ ]:
array([[0],
       [1],
       [2],
       [3],
       [4],
       [5],
       [6],
       [7],
       [8]])
In [ ]:
b.shape
Out[ ]:
(9, 1)
In [ ]:
c= np.arange(9)
c
Out[ ]:
array([0, 1, 2, 3, 4, 5, 6, 7, 8])
In [ ]:
c.shape
Out[ ]:
(9,)
In [ ]:
d=c[np.newaxis, :]
d
Out[ ]:
array([[0, 1, 2, 3, 4, 5, 6, 7, 8]])
In [ ]:
d.shape
Out[ ]:
(1, 9)
In [ ]:
a
Out[ ]:
array([0, 1, 2, 3, 4, 5, 6, 7, 8])
In [ ]:
a[2]
Out[ ]:
2
In [ ]:
a[0:5]
Out[ ]:
array([0, 1, 2, 3, 4])
In [ ]:
a*6
Out[ ]:
array([ 0,  6, 12, 18, 24, 30, 36, 42, 48])
In [ ]:
a+6
Out[ ]:
array([ 6,  7,  8,  9, 10, 11, 12, 13, 14])
In [ ]:
a.sum()
Out[ ]:
36
In [ ]:
a.mean()
Out[ ]:
4.0
In [ ]:
a.max()
Out[ ]:
8
In [ ]:
a.min()
Out[ ]:
0

Chapter 5- Pandas¶

How to install library¶

pip install pandas

pip instal numpy

Importing Libraries¶

In [ ]:
#importing libraries 
import pandas as pd 
import numpy as np 
In [ ]:
# object creation 
s= pd.Series([1,2,np.nan ,5,7,8,9])
s
Out[ ]:
0    1.0
1    2.0
2    NaN
3    5.0
4    7.0
5    8.0
6    9.0
dtype: float64
In [ ]:
dates = pd.date_range("20220101", periods=9)
dates 
Out[ ]:
DatetimeIndex(['2022-01-01', '2022-01-02', '2022-01-03', '2022-01-04',
               '2022-01-05', '2022-01-06', '2022-01-07', '2022-01-08',
               '2022-01-09'],
              dtype='datetime64[ns]', freq='D')
In [ ]:
dates = pd.date_range("20220101", periods=33)
dates 
Out[ ]:
DatetimeIndex(['2022-01-01', '2022-01-02', '2022-01-03', '2022-01-04',
               '2022-01-05', '2022-01-06', '2022-01-07', '2022-01-08',
               '2022-01-09', '2022-01-10', '2022-01-11', '2022-01-12',
               '2022-01-13', '2022-01-14', '2022-01-15', '2022-01-16',
               '2022-01-17', '2022-01-18', '2022-01-19', '2022-01-20',
               '2022-01-21', '2022-01-22', '2022-01-23', '2022-01-24',
               '2022-01-25', '2022-01-26', '2022-01-27', '2022-01-28',
               '2022-01-29', '2022-01-30', '2022-01-31', '2022-02-01',
               '2022-02-02'],
              dtype='datetime64[ns]', freq='D')
In [ ]:
df= pd.DataFrame(np.random.randn(33,5), index= dates, columns= list("ABCDE"))
df
Out[ ]:
A B C D E
2022-01-01 -1.424660 0.772210 0.783466 -0.712659 -2.551065
2022-01-02 1.382429 -1.708730 1.176728 1.949761 -0.492011
2022-01-03 0.325283 -0.834825 -0.879866 -0.137038 0.919688
2022-01-04 0.552770 -0.454804 -0.747360 1.800993 -0.275458
2022-01-05 -0.896864 -0.958084 0.579021 0.478870 0.260666
2022-01-06 -3.103231 -0.340718 0.050150 0.058931 -1.712098
2022-01-07 1.769682 -1.402604 -2.110711 -0.922010 -0.753877
2022-01-08 0.660826 1.017961 0.569261 -1.209834 1.056544
2022-01-09 0.125091 -0.340940 1.023370 -1.555463 0.322947
2022-01-10 1.915190 0.314069 -0.638422 -1.023607 -1.544831
2022-01-11 -0.637742 -1.434976 -1.276443 -1.153942 0.761682
2022-01-12 -0.742913 -1.484980 -0.917490 -0.283180 -0.053965
2022-01-13 0.201580 0.786547 -1.459392 0.126138 -0.312369
2022-01-14 1.321449 0.050355 -0.933186 -1.043294 1.676111
2022-01-15 -0.436737 -0.262544 0.686336 0.144395 0.625462
2022-01-16 -0.152087 0.789873 -1.096439 -0.172554 -1.109436
2022-01-17 0.379340 -0.193847 -1.390857 -1.056517 -0.158073
2022-01-18 0.785079 -1.145713 0.233175 -0.294474 -0.493845
2022-01-19 -1.454019 0.974247 -0.442348 0.044247 1.190501
2022-01-20 0.503909 -0.776441 -0.323965 0.813752 -0.409261
2022-01-21 -0.270772 0.698006 -0.801287 -0.182554 0.852357
2022-01-22 -0.528683 0.258139 0.589976 0.325295 1.688333
2022-01-23 0.240170 -0.518060 0.402815 -1.852271 -0.960517
2022-01-24 0.615130 -0.661989 1.052420 0.529098 -2.626316
2022-01-25 -0.344456 1.276411 -0.625939 -1.117180 -0.680828
2022-01-26 -0.073498 0.611255 1.133547 0.358266 -0.694200
2022-01-27 -1.981745 0.243877 -0.122814 0.343524 1.155793
2022-01-28 -0.023202 0.546315 -0.150933 -0.316904 0.220736
2022-01-29 0.201237 0.241144 1.538946 -0.113433 2.243980
2022-01-30 0.073743 -0.597354 -0.399961 0.624458 -0.305179
2022-01-31 -0.752144 -0.738474 0.248588 0.375347 0.729071
2022-02-01 0.635539 0.457294 -0.037721 -0.169999 -0.373402
2022-02-02 1.761750 0.542268 0.059822 0.817176 0.607860
In [ ]:
df2= pd.DataFrame(
    {
    "A" : 1.0,
    "B": pd.Timestamp("20130102"),
    "C": pd.Series(1, index= list(range(4)), dtype="float32"),
    "D": np.array([3]*4, dtype= "int32"),
    "E": pd.Categorical(["test","train","test","train"]),
    "F": "foo"
    }
    )
df2
Out[ ]:
A B C D E F
0 1.0 2013-01-02 1.0 3 test foo
1 1.0 2013-01-02 1.0 3 train foo
2 1.0 2013-01-02 1.0 3 test foo
3 1.0 2013-01-02 1.0 3 train foo
In [ ]:
df2.dtypes
Out[ ]:
A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object
In [ ]:
df2.head(2)
Out[ ]:
A B C D E F
0 1.0 2013-01-02 1.0 3 test foo
1 1.0 2013-01-02 1.0 3 train foo
In [ ]:
df.tail(2)
Out[ ]:
A B C D E
2022-02-01 0.635539 0.457294 -0.037721 -0.169999 -0.373402
2022-02-02 1.761750 0.542268 0.059822 0.817176 0.607860
In [ ]:
df2.index
Out[ ]:
Int64Index([0, 1, 2, 3], dtype='int64')
In [ ]:
dates1 = pd.date_range("20220101", periods=20)
dates1 
Out[ ]:
DatetimeIndex(['2022-01-01', '2022-01-02', '2022-01-03', '2022-01-04',
               '2022-01-05', '2022-01-06', '2022-01-07', '2022-01-08',
               '2022-01-09', '2022-01-10', '2022-01-11', '2022-01-12',
               '2022-01-13', '2022-01-14', '2022-01-15', '2022-01-16',
               '2022-01-17', '2022-01-18', '2022-01-19', '2022-01-20'],
              dtype='datetime64[ns]', freq='D')
In [ ]:
df1= pd.DataFrame(np.random.randn(20,5), index= dates1, columns= list("ABCDE"))
df1
Out[ ]:
A B C D E
2022-01-01 0.388135 -0.694500 1.710742 -0.953853 1.647141
2022-01-02 -0.517575 -0.188147 0.630036 -0.881991 -0.981272
2022-01-03 -1.674983 -0.586637 -1.340947 0.702661 -0.929540
2022-01-04 0.581906 -0.080027 1.137075 0.775445 -1.796040
2022-01-05 -1.273825 0.312914 -1.165314 -0.169640 -0.368941
2022-01-06 0.515602 0.490068 -0.545239 -0.408955 1.333627
2022-01-07 0.169663 -1.019295 -0.668360 -2.786986 -0.199359
2022-01-08 -0.130497 0.102798 0.288203 -0.291382 -0.364559
2022-01-09 -0.369015 1.283904 0.839903 -0.279453 -1.272016
2022-01-10 -0.814083 -0.705237 -1.627397 -2.385590 -0.144798
2022-01-11 -1.542745 -0.536425 0.484937 -0.486977 0.237764
2022-01-12 -0.882890 0.175792 -0.131634 0.699205 0.792908
2022-01-13 1.235072 0.640500 -1.283168 -1.422045 -0.402504
2022-01-14 -0.085713 0.117496 2.047838 0.240406 0.784371
2022-01-15 1.157494 -0.128045 -0.278789 0.296545 0.371711
2022-01-16 -0.884580 -1.623248 0.492578 0.088836 -0.049226
2022-01-17 -1.324278 -0.765477 0.907903 -0.808565 0.004943
2022-01-18 1.681379 -0.302969 -1.158782 1.267730 -2.336489
2022-01-19 0.821827 1.307620 -0.502119 -1.896293 -0.194628
2022-01-20 -0.875702 0.433110 0.883599 0.561136 -0.466486
In [ ]:
a=df1.to_numpy()
Out[ ]:
array([[ 0.38813495, -0.69450026,  1.71074173, -0.95385292,  1.64714073],
       [-0.51757465, -0.18814687,  0.63003587, -0.88199128, -0.98127153],
       [-1.67498296, -0.58663723, -1.34094674,  0.70266092, -0.92954028],
       [ 0.58190588, -0.08002672,  1.13707543,  0.77544498, -1.79604041],
       [-1.27382455,  0.31291439, -1.16531406, -0.1696396 , -0.36894076],
       [ 0.51560227,  0.49006838, -0.54523928, -0.40895489,  1.33362693],
       [ 0.16966288, -1.01929506, -0.66836035, -2.78698617, -0.19935927],
       [-0.13049661,  0.1027985 ,  0.28820314, -0.29138204, -0.36455922],
       [-0.36901525,  1.28390443,  0.83990284, -0.27945273, -1.27201643],
       [-0.81408326, -0.70523725, -1.62739717, -2.38558997, -0.14479825],
       [-1.54274494, -0.53642495,  0.48493728, -0.48697742,  0.2377637 ],
       [-0.88289022,  0.17579184, -0.13163351,  0.69920486,  0.79290814],
       [ 1.23507203,  0.64050031, -1.28316767, -1.42204502, -0.40250438],
       [-0.08571273,  0.1174964 ,  2.04783829,  0.2404063 ,  0.7843707 ],
       [ 1.15749355, -0.12804461, -0.27878909,  0.29654452,  0.37171123],
       [-0.88458023, -1.62324761,  0.4925782 ,  0.08883562, -0.04922586],
       [-1.32427812, -0.76547659,  0.90790288, -0.80856471,  0.00494294],
       [ 1.68137876, -0.30296923, -1.15878239,  1.26773   , -2.33648888],
       [ 0.82182693,  1.30762029, -0.50211862, -1.8962927 , -0.19462804],
       [-0.87570223,  0.43311031,  0.88359855,  0.56113551, -0.46648558]])
In [ ]:
a.shape
Out[ ]:
(9,)
In [ ]:
df2.to_numpy()
Out[ ]:
array([[1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo']],
      dtype=object)
In [ ]:
#details of data 
df1.describe()
Out[ ]:
A B C D E
count 20.000000 20.000000 20.000000 20.000000 20.000000
mean -0.191240 -0.088290 0.036053 -0.406988 -0.216670
std 0.984138 0.733233 1.064758 1.084846 0.969833
min -1.674983 -1.623248 -1.627397 -2.786986 -2.336489
25% -0.883313 -0.613603 -0.790966 -0.899957 -0.582249
50% -0.249756 -0.104036 0.078285 -0.285417 -0.196994
75% 0.532178 0.342963 0.850827 0.362692 0.271251
max 1.681379 1.307620 2.047838 1.267730 1.647141
In [ ]:
#to transpose the data 
df2.T
Out[ ]:
0 1 2 3
A 1.0 1.0 1.0 1.0
B 2013-01-02 00:00:00 2013-01-02 00:00:00 2013-01-02 00:00:00 2013-01-02 00:00:00
C 1.0 1.0 1.0 1.0
D 3 3 3 3
E test train test train
F foo foo foo foo
In [ ]:
# Sorting
df1.sort_index(axis=0, ascending=False)
Out[ ]:
A B C D E
2022-01-20 -0.875702 0.433110 0.883599 0.561136 -0.466486
2022-01-19 0.821827 1.307620 -0.502119 -1.896293 -0.194628
2022-01-18 1.681379 -0.302969 -1.158782 1.267730 -2.336489
2022-01-17 -1.324278 -0.765477 0.907903 -0.808565 0.004943
2022-01-16 -0.884580 -1.623248 0.492578 0.088836 -0.049226
2022-01-15 1.157494 -0.128045 -0.278789 0.296545 0.371711
2022-01-14 -0.085713 0.117496 2.047838 0.240406 0.784371
2022-01-13 1.235072 0.640500 -1.283168 -1.422045 -0.402504
2022-01-12 -0.882890 0.175792 -0.131634 0.699205 0.792908
2022-01-11 -1.542745 -0.536425 0.484937 -0.486977 0.237764
2022-01-10 -0.814083 -0.705237 -1.627397 -2.385590 -0.144798
2022-01-09 -0.369015 1.283904 0.839903 -0.279453 -1.272016
2022-01-08 -0.130497 0.102798 0.288203 -0.291382 -0.364559
2022-01-07 0.169663 -1.019295 -0.668360 -2.786986 -0.199359
2022-01-06 0.515602 0.490068 -0.545239 -0.408955 1.333627
2022-01-05 -1.273825 0.312914 -1.165314 -0.169640 -0.368941
2022-01-04 0.581906 -0.080027 1.137075 0.775445 -1.796040
2022-01-03 -1.674983 -0.586637 -1.340947 0.702661 -0.929540
2022-01-02 -0.517575 -0.188147 0.630036 -0.881991 -0.981272
2022-01-01 0.388135 -0.694500 1.710742 -0.953853 1.647141
In [ ]:
df.sort_values(by="B")
Out[ ]:
A B C D E
2022-01-02 1.382429 -1.708730 1.176728 1.949761 -0.492011
2022-01-12 -0.742913 -1.484980 -0.917490 -0.283180 -0.053965
2022-01-11 -0.637742 -1.434976 -1.276443 -1.153942 0.761682
2022-01-07 1.769682 -1.402604 -2.110711 -0.922010 -0.753877
2022-01-18 0.785079 -1.145713 0.233175 -0.294474 -0.493845
2022-01-05 -0.896864 -0.958084 0.579021 0.478870 0.260666
2022-01-03 0.325283 -0.834825 -0.879866 -0.137038 0.919688
2022-01-20 0.503909 -0.776441 -0.323965 0.813752 -0.409261
2022-01-31 -0.752144 -0.738474 0.248588 0.375347 0.729071
2022-01-24 0.615130 -0.661989 1.052420 0.529098 -2.626316
2022-01-30 0.073743 -0.597354 -0.399961 0.624458 -0.305179
2022-01-23 0.240170 -0.518060 0.402815 -1.852271 -0.960517
2022-01-04 0.552770 -0.454804 -0.747360 1.800993 -0.275458
2022-01-09 0.125091 -0.340940 1.023370 -1.555463 0.322947
2022-01-06 -3.103231 -0.340718 0.050150 0.058931 -1.712098
2022-01-15 -0.436737 -0.262544 0.686336 0.144395 0.625462
2022-01-17 0.379340 -0.193847 -1.390857 -1.056517 -0.158073
2022-01-14 1.321449 0.050355 -0.933186 -1.043294 1.676111
2022-01-29 0.201237 0.241144 1.538946 -0.113433 2.243980
2022-01-27 -1.981745 0.243877 -0.122814 0.343524 1.155793
2022-01-22 -0.528683 0.258139 0.589976 0.325295 1.688333
2022-01-10 1.915190 0.314069 -0.638422 -1.023607 -1.544831
2022-02-01 0.635539 0.457294 -0.037721 -0.169999 -0.373402
2022-02-02 1.761750 0.542268 0.059822 0.817176 0.607860
2022-01-28 -0.023202 0.546315 -0.150933 -0.316904 0.220736
2022-01-26 -0.073498 0.611255 1.133547 0.358266 -0.694200
2022-01-21 -0.270772 0.698006 -0.801287 -0.182554 0.852357
2022-01-01 -1.424660 0.772210 0.783466 -0.712659 -2.551065
2022-01-13 0.201580 0.786547 -1.459392 0.126138 -0.312369
2022-01-16 -0.152087 0.789873 -1.096439 -0.172554 -1.109436
2022-01-19 -1.454019 0.974247 -0.442348 0.044247 1.190501
2022-01-08 0.660826 1.017961 0.569261 -1.209834 1.056544
2022-01-25 -0.344456 1.276411 -0.625939 -1.117180 -0.680828
In [ ]:
df.sort_values(by="B",ascending=False)
Out[ ]:
A B C D E
2022-01-25 -0.344456 1.276411 -0.625939 -1.117180 -0.680828
2022-01-08 0.660826 1.017961 0.569261 -1.209834 1.056544
2022-01-19 -1.454019 0.974247 -0.442348 0.044247 1.190501
2022-01-16 -0.152087 0.789873 -1.096439 -0.172554 -1.109436
2022-01-13 0.201580 0.786547 -1.459392 0.126138 -0.312369
2022-01-01 -1.424660 0.772210 0.783466 -0.712659 -2.551065
2022-01-21 -0.270772 0.698006 -0.801287 -0.182554 0.852357
2022-01-26 -0.073498 0.611255 1.133547 0.358266 -0.694200
2022-01-28 -0.023202 0.546315 -0.150933 -0.316904 0.220736
2022-02-02 1.761750 0.542268 0.059822 0.817176 0.607860
2022-02-01 0.635539 0.457294 -0.037721 -0.169999 -0.373402
2022-01-10 1.915190 0.314069 -0.638422 -1.023607 -1.544831
2022-01-22 -0.528683 0.258139 0.589976 0.325295 1.688333
2022-01-27 -1.981745 0.243877 -0.122814 0.343524 1.155793
2022-01-29 0.201237 0.241144 1.538946 -0.113433 2.243980
2022-01-14 1.321449 0.050355 -0.933186 -1.043294 1.676111
2022-01-17 0.379340 -0.193847 -1.390857 -1.056517 -0.158073
2022-01-15 -0.436737 -0.262544 0.686336 0.144395 0.625462
2022-01-06 -3.103231 -0.340718 0.050150 0.058931 -1.712098
2022-01-09 0.125091 -0.340940 1.023370 -1.555463 0.322947
2022-01-04 0.552770 -0.454804 -0.747360 1.800993 -0.275458
2022-01-23 0.240170 -0.518060 0.402815 -1.852271 -0.960517
2022-01-30 0.073743 -0.597354 -0.399961 0.624458 -0.305179
2022-01-24 0.615130 -0.661989 1.052420 0.529098 -2.626316
2022-01-31 -0.752144 -0.738474 0.248588 0.375347 0.729071
2022-01-20 0.503909 -0.776441 -0.323965 0.813752 -0.409261
2022-01-03 0.325283 -0.834825 -0.879866 -0.137038 0.919688
2022-01-05 -0.896864 -0.958084 0.579021 0.478870 0.260666
2022-01-18 0.785079 -1.145713 0.233175 -0.294474 -0.493845
2022-01-07 1.769682 -1.402604 -2.110711 -0.922010 -0.753877
2022-01-11 -0.637742 -1.434976 -1.276443 -1.153942 0.761682
2022-01-12 -0.742913 -1.484980 -0.917490 -0.283180 -0.053965
2022-01-02 1.382429 -1.708730 1.176728 1.949761 -0.492011
In [ ]:
df1["A"]
Out[ ]:
2022-01-01    0.388135
2022-01-02   -0.517575
2022-01-03   -1.674983
2022-01-04    0.581906
2022-01-05   -1.273825
2022-01-06    0.515602
2022-01-07    0.169663
2022-01-08   -0.130497
2022-01-09   -0.369015
2022-01-10   -0.814083
2022-01-11   -1.542745
2022-01-12   -0.882890
2022-01-13    1.235072
2022-01-14   -0.085713
2022-01-15    1.157494
2022-01-16   -0.884580
2022-01-17   -1.324278
2022-01-18    1.681379
2022-01-19    0.821827
2022-01-20   -0.875702
Freq: D, Name: A, dtype: float64
In [ ]:
#filteration data with coloum wise or indexwise
df1["B"]
Out[ ]:
2022-01-01   -0.694500
2022-01-02   -0.188147
2022-01-03   -0.586637
2022-01-04   -0.080027
2022-01-05    0.312914
2022-01-06    0.490068
2022-01-07   -1.019295
2022-01-08    0.102798
2022-01-09    1.283904
2022-01-10   -0.705237
2022-01-11   -0.536425
2022-01-12    0.175792
2022-01-13    0.640500
2022-01-14    0.117496
2022-01-15   -0.128045
2022-01-16   -1.623248
2022-01-17   -0.765477
2022-01-18   -0.302969
2022-01-19    1.307620
2022-01-20    0.433110
Freq: D, Name: B, dtype: float64
In [ ]:
# TO select data row wise 
df[0:1]
Out[ ]:
A B C D E
2022-01-01 -1.42466 0.77221 0.783466 -0.712659 -2.551065
In [ ]:
df[0:2]
Out[ ]:
A B C D E
2022-01-01 -1.424660 0.77221 0.783466 -0.712659 -2.551065
2022-01-02 1.382429 -1.70873 1.176728 1.949761 -0.492011
In [ ]:
df[0:10]
Out[ ]:
A B C D E
2022-01-01 -1.424660 0.772210 0.783466 -0.712659 -2.551065
2022-01-02 1.382429 -1.708730 1.176728 1.949761 -0.492011
2022-01-03 0.325283 -0.834825 -0.879866 -0.137038 0.919688
2022-01-04 0.552770 -0.454804 -0.747360 1.800993 -0.275458
2022-01-05 -0.896864 -0.958084 0.579021 0.478870 0.260666
2022-01-06 -3.103231 -0.340718 0.050150 0.058931 -1.712098
2022-01-07 1.769682 -1.402604 -2.110711 -0.922010 -0.753877
2022-01-08 0.660826 1.017961 0.569261 -1.209834 1.056544
2022-01-09 0.125091 -0.340940 1.023370 -1.555463 0.322947
2022-01-10 1.915190 0.314069 -0.638422 -1.023607 -1.544831
In [ ]:
df[1:10]
Out[ ]:
A B C D E
2022-01-02 1.382429 -1.708730 1.176728 1.949761 -0.492011
2022-01-03 0.325283 -0.834825 -0.879866 -0.137038 0.919688
2022-01-04 0.552770 -0.454804 -0.747360 1.800993 -0.275458
2022-01-05 -0.896864 -0.958084 0.579021 0.478870 0.260666
2022-01-06 -3.103231 -0.340718 0.050150 0.058931 -1.712098
2022-01-07 1.769682 -1.402604 -2.110711 -0.922010 -0.753877
2022-01-08 0.660826 1.017961 0.569261 -1.209834 1.056544
2022-01-09 0.125091 -0.340940 1.023370 -1.555463 0.322947
2022-01-10 1.915190 0.314069 -0.638422 -1.023607 -1.544831
In [ ]:
df1.head()
Out[ ]:
A B C D E
2022-01-01 0.388135 -0.694500 1.710742 -0.953853 1.647141
2022-01-02 -0.517575 -0.188147 0.630036 -0.881991 -0.981272
2022-01-03 -1.674983 -0.586637 -1.340947 0.702661 -0.929540
2022-01-04 0.581906 -0.080027 1.137075 0.775445 -1.796040
2022-01-05 -1.273825 0.312914 -1.165314 -0.169640 -0.368941
In [ ]:
# showing the only 16th row  and its values 
df.loc[dates[15]]
Out[ ]:
A   -0.152087
B    0.789873
C   -1.096439
D   -0.172554
E   -1.109436
Name: 2022-01-16 00:00:00, dtype: float64
In [ ]:
#multiple axis lables 
df.loc[:, ["A","B"]]
Out[ ]:
A B
2022-01-01 -1.424660 0.772210
2022-01-02 1.382429 -1.708730
2022-01-03 0.325283 -0.834825
2022-01-04 0.552770 -0.454804
2022-01-05 -0.896864 -0.958084
2022-01-06 -3.103231 -0.340718
2022-01-07 1.769682 -1.402604
2022-01-08 0.660826 1.017961
2022-01-09 0.125091 -0.340940
2022-01-10 1.915190 0.314069
2022-01-11 -0.637742 -1.434976
2022-01-12 -0.742913 -1.484980
2022-01-13 0.201580 0.786547
2022-01-14 1.321449 0.050355
2022-01-15 -0.436737 -0.262544
2022-01-16 -0.152087 0.789873
2022-01-17 0.379340 -0.193847
2022-01-18 0.785079 -1.145713
2022-01-19 -1.454019 0.974247
2022-01-20 0.503909 -0.776441
2022-01-21 -0.270772 0.698006
2022-01-22 -0.528683 0.258139
2022-01-23 0.240170 -0.518060
2022-01-24 0.615130 -0.661989
2022-01-25 -0.344456 1.276411
2022-01-26 -0.073498 0.611255
2022-01-27 -1.981745 0.243877
2022-01-28 -0.023202 0.546315
2022-01-29 0.201237 0.241144
2022-01-30 0.073743 -0.597354
2022-01-31 -0.752144 -0.738474
2022-02-01 0.635539 0.457294
2022-02-02 1.761750 0.542268
In [ ]:
df.loc["20220109":"20220113",["A","B", "C"]]
Out[ ]:
A B C
2022-01-09 0.125091 -0.340940 1.023370
2022-01-10 1.915190 0.314069 -0.638422
2022-01-11 -0.637742 -1.434976 -1.276443
2022-01-12 -0.742913 -1.484980 -0.917490
2022-01-13 0.201580 0.786547 -1.459392
In [ ]:
df.loc["20220109",["A","B", "C"]]
Out[ ]:
A    0.125091
B   -0.340940
C    1.023370
Name: 2022-01-09 00:00:00, dtype: float64
In [ ]:
#Scalar value
df.at[dates[0],"A"]
Out[ ]:
-1.4246598703006963
In [ ]:
df.iloc[3]
Out[ ]:
A    0.552770
B   -0.454804
C   -0.747360
D    1.800993
E   -0.275458
Name: 2022-01-04 00:00:00, dtype: float64
In [ ]:
df.iloc[3:10]
Out[ ]:
A B C D E
2022-01-04 0.552770 -0.454804 -0.747360 1.800993 -0.275458
2022-01-05 -0.896864 -0.958084 0.579021 0.478870 0.260666
2022-01-06 -3.103231 -0.340718 0.050150 0.058931 -1.712098
2022-01-07 1.769682 -1.402604 -2.110711 -0.922010 -0.753877
2022-01-08 0.660826 1.017961 0.569261 -1.209834 1.056544
2022-01-09 0.125091 -0.340940 1.023370 -1.555463 0.322947
2022-01-10 1.915190 0.314069 -0.638422 -1.023607 -1.544831
In [ ]:
#       rows col
df.iloc[0:5, 0:2]
Out[ ]:
A B
2022-01-01 -1.424660 0.772210
2022-01-02 1.382429 -1.708730
2022-01-03 0.325283 -0.834825
2022-01-04 0.552770 -0.454804
2022-01-05 -0.896864 -0.958084
In [ ]:
df.iloc[:, 0:2]
Out[ ]:
A B
2022-01-01 -1.424660 0.772210
2022-01-02 1.382429 -1.708730
2022-01-03 0.325283 -0.834825
2022-01-04 0.552770 -0.454804
2022-01-05 -0.896864 -0.958084
2022-01-06 -3.103231 -0.340718
2022-01-07 1.769682 -1.402604
2022-01-08 0.660826 1.017961
2022-01-09 0.125091 -0.340940
2022-01-10 1.915190 0.314069
2022-01-11 -0.637742 -1.434976
2022-01-12 -0.742913 -1.484980
2022-01-13 0.201580 0.786547
2022-01-14 1.321449 0.050355
2022-01-15 -0.436737 -0.262544
2022-01-16 -0.152087 0.789873
2022-01-17 0.379340 -0.193847
2022-01-18 0.785079 -1.145713
2022-01-19 -1.454019 0.974247
2022-01-20 0.503909 -0.776441
2022-01-21 -0.270772 0.698006
2022-01-22 -0.528683 0.258139
2022-01-23 0.240170 -0.518060
2022-01-24 0.615130 -0.661989
2022-01-25 -0.344456 1.276411
2022-01-26 -0.073498 0.611255
2022-01-27 -1.981745 0.243877
2022-01-28 -0.023202 0.546315
2022-01-29 0.201237 0.241144
2022-01-30 0.073743 -0.597354
2022-01-31 -0.752144 -0.738474
2022-02-01 0.635539 0.457294
2022-02-02 1.761750 0.542268
In [ ]:
df[df["A"]> 0]
Out[ ]:
A B C D E
2022-01-02 1.382429 -1.708730 1.176728 1.949761 -0.492011
2022-01-03 0.325283 -0.834825 -0.879866 -0.137038 0.919688
2022-01-04 0.552770 -0.454804 -0.747360 1.800993 -0.275458
2022-01-07 1.769682 -1.402604 -2.110711 -0.922010 -0.753877
2022-01-08 0.660826 1.017961 0.569261 -1.209834 1.056544
2022-01-09 0.125091 -0.340940 1.023370 -1.555463 0.322947
2022-01-10 1.915190 0.314069 -0.638422 -1.023607 -1.544831
2022-01-13 0.201580 0.786547 -1.459392 0.126138 -0.312369
2022-01-14 1.321449 0.050355 -0.933186 -1.043294 1.676111
2022-01-17 0.379340 -0.193847 -1.390857 -1.056517 -0.158073
2022-01-18 0.785079 -1.145713 0.233175 -0.294474 -0.493845
2022-01-20 0.503909 -0.776441 -0.323965 0.813752 -0.409261
2022-01-23 0.240170 -0.518060 0.402815 -1.852271 -0.960517
2022-01-24 0.615130 -0.661989 1.052420 0.529098 -2.626316
2022-01-29 0.201237 0.241144 1.538946 -0.113433 2.243980
2022-01-30 0.073743 -0.597354 -0.399961 0.624458 -0.305179
2022-02-01 0.635539 0.457294 -0.037721 -0.169999 -0.373402
2022-02-02 1.761750 0.542268 0.059822 0.817176 0.607860
In [ ]:
df[df["A"]> 0]
Out[ ]:
A B C D E
2022-01-02 1.382429 -1.708730 1.176728 1.949761 -0.492011
2022-01-03 0.325283 -0.834825 -0.879866 -0.137038 0.919688
2022-01-04 0.552770 -0.454804 -0.747360 1.800993 -0.275458
2022-01-07 1.769682 -1.402604 -2.110711 -0.922010 -0.753877
2022-01-08 0.660826 1.017961 0.569261 -1.209834 1.056544
2022-01-09 0.125091 -0.340940 1.023370 -1.555463 0.322947
2022-01-10 1.915190 0.314069 -0.638422 -1.023607 -1.544831
2022-01-13 0.201580 0.786547 -1.459392 0.126138 -0.312369
2022-01-14 1.321449 0.050355 -0.933186 -1.043294 1.676111
2022-01-17 0.379340 -0.193847 -1.390857 -1.056517 -0.158073
2022-01-18 0.785079 -1.145713 0.233175 -0.294474 -0.493845
2022-01-20 0.503909 -0.776441 -0.323965 0.813752 -0.409261
2022-01-23 0.240170 -0.518060 0.402815 -1.852271 -0.960517
2022-01-24 0.615130 -0.661989 1.052420 0.529098 -2.626316
2022-01-29 0.201237 0.241144 1.538946 -0.113433 2.243980
2022-01-30 0.073743 -0.597354 -0.399961 0.624458 -0.305179
2022-02-01 0.635539 0.457294 -0.037721 -0.169999 -0.373402
2022-02-02 1.761750 0.542268 0.059822 0.817176 0.607860

Assignment¶

In [ ]:
# assignment :Getting non zero values in more then one column 
df[ df.iloc[:, 0:5]> 0]
Out[ ]:
A B C D E
2022-01-01 NaN 0.772210 0.783466 NaN NaN
2022-01-02 1.382429 NaN 1.176728 1.949761 NaN
2022-01-03 0.325283 NaN NaN NaN 0.919688
2022-01-04 0.552770 NaN NaN 1.800993 NaN
2022-01-05 NaN NaN 0.579021 0.478870 0.260666
2022-01-06 NaN NaN 0.050150 0.058931 NaN
2022-01-07 1.769682 NaN NaN NaN NaN
2022-01-08 0.660826 1.017961 0.569261 NaN 1.056544
2022-01-09 0.125091 NaN 1.023370 NaN 0.322947
2022-01-10 1.915190 0.314069 NaN NaN NaN
2022-01-11 NaN NaN NaN NaN 0.761682
2022-01-12 NaN NaN NaN NaN NaN
2022-01-13 0.201580 0.786547 NaN 0.126138 NaN
2022-01-14 1.321449 0.050355 NaN NaN 1.676111
2022-01-15 NaN NaN 0.686336 0.144395 0.625462
2022-01-16 NaN 0.789873 NaN NaN NaN
2022-01-17 0.379340 NaN NaN NaN NaN
2022-01-18 0.785079 NaN 0.233175 NaN NaN
2022-01-19 NaN 0.974247 NaN 0.044247 1.190501
2022-01-20 0.503909 NaN NaN 0.813752 NaN
2022-01-21 NaN 0.698006 NaN NaN 0.852357
2022-01-22 NaN 0.258139 0.589976 0.325295 1.688333
2022-01-23 0.240170 NaN 0.402815 NaN NaN
2022-01-24 0.615130 NaN 1.052420 0.529098 NaN
2022-01-25 NaN 1.276411 NaN NaN NaN
2022-01-26 NaN 0.611255 1.133547 0.358266 NaN
2022-01-27 NaN 0.243877 NaN 0.343524 1.155793
2022-01-28 NaN 0.546315 NaN NaN 0.220736
2022-01-29 0.201237 0.241144 1.538946 NaN 2.243980
2022-01-30 0.073743 NaN NaN 0.624458 NaN
2022-01-31 NaN NaN 0.248588 0.375347 0.729071
2022-02-01 0.635539 0.457294 NaN NaN NaN
2022-02-02 1.761750 0.542268 0.059822 0.817176 0.607860
In [ ]:
df[df["A"]> 0]
Out[ ]:
A B C D E
2022-01-02 1.382429 -1.708730 1.176728 1.949761 -0.492011
2022-01-03 0.325283 -0.834825 -0.879866 -0.137038 0.919688
2022-01-04 0.552770 -0.454804 -0.747360 1.800993 -0.275458
2022-01-07 1.769682 -1.402604 -2.110711 -0.922010 -0.753877
2022-01-08 0.660826 1.017961 0.569261 -1.209834 1.056544
2022-01-09 0.125091 -0.340940 1.023370 -1.555463 0.322947
2022-01-10 1.915190 0.314069 -0.638422 -1.023607 -1.544831
2022-01-13 0.201580 0.786547 -1.459392 0.126138 -0.312369
2022-01-14 1.321449 0.050355 -0.933186 -1.043294 1.676111
2022-01-17 0.379340 -0.193847 -1.390857 -1.056517 -0.158073
2022-01-18 0.785079 -1.145713 0.233175 -0.294474 -0.493845
2022-01-20 0.503909 -0.776441 -0.323965 0.813752 -0.409261
2022-01-23 0.240170 -0.518060 0.402815 -1.852271 -0.960517
2022-01-24 0.615130 -0.661989 1.052420 0.529098 -2.626316
2022-01-29 0.201237 0.241144 1.538946 -0.113433 2.243980
2022-01-30 0.073743 -0.597354 -0.399961 0.624458 -0.305179
2022-02-01 0.635539 0.457294 -0.037721 -0.169999 -0.373402
2022-02-02 1.761750 0.542268 0.059822 0.817176 0.607860
In [ ]:
df[df>0]
Out[ ]:
A B C D E
2022-01-01 NaN 0.772210 0.783466 NaN NaN
2022-01-02 1.382429 NaN 1.176728 1.949761 NaN
2022-01-03 0.325283 NaN NaN NaN 0.919688
2022-01-04 0.552770 NaN NaN 1.800993 NaN
2022-01-05 NaN NaN 0.579021 0.478870 0.260666
2022-01-06 NaN NaN 0.050150 0.058931 NaN
2022-01-07 1.769682 NaN NaN NaN NaN
2022-01-08 0.660826 1.017961 0.569261 NaN 1.056544
2022-01-09 0.125091 NaN 1.023370 NaN 0.322947
2022-01-10 1.915190 0.314069 NaN NaN NaN
2022-01-11 NaN NaN NaN NaN 0.761682
2022-01-12 NaN NaN NaN NaN NaN
2022-01-13 0.201580 0.786547 NaN 0.126138 NaN
2022-01-14 1.321449 0.050355 NaN NaN 1.676111
2022-01-15 NaN NaN 0.686336 0.144395 0.625462
2022-01-16 NaN 0.789873 NaN NaN NaN
2022-01-17 0.379340 NaN NaN NaN NaN
2022-01-18 0.785079 NaN 0.233175 NaN NaN
2022-01-19 NaN 0.974247 NaN 0.044247 1.190501
2022-01-20 0.503909 NaN NaN 0.813752 NaN
2022-01-21 NaN 0.698006 NaN NaN 0.852357
2022-01-22 NaN 0.258139 0.589976 0.325295 1.688333
2022-01-23 0.240170 NaN 0.402815 NaN NaN
2022-01-24 0.615130 NaN 1.052420 0.529098 NaN
2022-01-25 NaN 1.276411 NaN NaN NaN
2022-01-26 NaN 0.611255 1.133547 0.358266 NaN
2022-01-27 NaN 0.243877 NaN 0.343524 1.155793
2022-01-28 NaN 0.546315 NaN NaN 0.220736
2022-01-29 0.201237 0.241144 1.538946 NaN 2.243980
2022-01-30 0.073743 NaN NaN 0.624458 NaN
2022-01-31 NaN NaN 0.248588 0.375347 0.729071
2022-02-01 0.635539 0.457294 NaN NaN NaN
2022-02-02 1.761750 0.542268 0.059822 0.817176 0.607860
In [ ]:
df[ df.iloc[0:3]> 0 ]
Out[ ]:
A B C D E
2022-01-01 NaN 0.77221 0.783466 NaN NaN
2022-01-02 1.382429 NaN 1.176728 1.949761 NaN
2022-01-03 0.325283 NaN NaN NaN 0.919688
2022-01-04 NaN NaN NaN NaN NaN
2022-01-05 NaN NaN NaN NaN NaN
2022-01-06 NaN NaN NaN NaN NaN
2022-01-07 NaN NaN NaN NaN NaN
2022-01-08 NaN NaN NaN NaN NaN
2022-01-09 NaN NaN NaN NaN NaN
2022-01-10 NaN NaN NaN NaN NaN
2022-01-11 NaN NaN NaN NaN NaN
2022-01-12 NaN NaN NaN NaN NaN
2022-01-13 NaN NaN NaN NaN NaN
2022-01-14 NaN NaN NaN NaN NaN
2022-01-15 NaN NaN NaN NaN NaN
2022-01-16 NaN NaN NaN NaN NaN
2022-01-17 NaN NaN NaN NaN NaN
2022-01-18 NaN NaN NaN NaN NaN
2022-01-19 NaN NaN NaN NaN NaN
2022-01-20 NaN NaN NaN NaN NaN
2022-01-21 NaN NaN NaN NaN NaN
2022-01-22 NaN NaN NaN NaN NaN
2022-01-23 NaN NaN NaN NaN NaN
2022-01-24 NaN NaN NaN NaN NaN
2022-01-25 NaN NaN NaN NaN NaN
2022-01-26 NaN NaN NaN NaN NaN
2022-01-27 NaN NaN NaN NaN NaN
2022-01-28 NaN NaN NaN NaN NaN
2022-01-29 NaN NaN NaN NaN NaN
2022-01-30 NaN NaN NaN NaN NaN
2022-01-31 NaN NaN NaN NaN NaN
2022-02-01 NaN NaN NaN NaN NaN
2022-02-02 NaN NaN NaN NaN NaN
In [ ]:
# IS IN Method 
df3 = df.copy()
df3
Out[ ]:
A B C D E
2022-01-01 -1.424660 0.772210 0.783466 -0.712659 -2.551065
2022-01-02 1.382429 -1.708730 1.176728 1.949761 -0.492011
2022-01-03 0.325283 -0.834825 -0.879866 -0.137038 0.919688
2022-01-04 0.552770 -0.454804 -0.747360 1.800993 -0.275458
2022-01-05 -0.896864 -0.958084 0.579021 0.478870 0.260666
2022-01-06 -3.103231 -0.340718 0.050150 0.058931 -1.712098
2022-01-07 1.769682 -1.402604 -2.110711 -0.922010 -0.753877
2022-01-08 0.660826 1.017961 0.569261 -1.209834 1.056544
2022-01-09 0.125091 -0.340940 1.023370 -1.555463 0.322947
2022-01-10 1.915190 0.314069 -0.638422 -1.023607 -1.544831
2022-01-11 -0.637742 -1.434976 -1.276443 -1.153942 0.761682
2022-01-12 -0.742913 -1.484980 -0.917490 -0.283180 -0.053965
2022-01-13 0.201580 0.786547 -1.459392 0.126138 -0.312369
2022-01-14 1.321449 0.050355 -0.933186 -1.043294 1.676111
2022-01-15 -0.436737 -0.262544 0.686336 0.144395 0.625462
2022-01-16 -0.152087 0.789873 -1.096439 -0.172554 -1.109436
2022-01-17 0.379340 -0.193847 -1.390857 -1.056517 -0.158073
2022-01-18 0.785079 -1.145713 0.233175 -0.294474 -0.493845
2022-01-19 -1.454019 0.974247 -0.442348 0.044247 1.190501
2022-01-20 0.503909 -0.776441 -0.323965 0.813752 -0.409261
2022-01-21 -0.270772 0.698006 -0.801287 -0.182554 0.852357
2022-01-22 -0.528683 0.258139 0.589976 0.325295 1.688333
2022-01-23 0.240170 -0.518060 0.402815 -1.852271 -0.960517
2022-01-24 0.615130 -0.661989 1.052420 0.529098 -2.626316
2022-01-25 -0.344456 1.276411 -0.625939 -1.117180 -0.680828
2022-01-26 -0.073498 0.611255 1.133547 0.358266 -0.694200
2022-01-27 -1.981745 0.243877 -0.122814 0.343524 1.155793
2022-01-28 -0.023202 0.546315 -0.150933 -0.316904 0.220736
2022-01-29 0.201237 0.241144 1.538946 -0.113433 2.243980
2022-01-30 0.073743 -0.597354 -0.399961 0.624458 -0.305179
2022-01-31 -0.752144 -0.738474 0.248588 0.375347 0.729071
2022-02-01 0.635539 0.457294 -0.037721 -0.169999 -0.373402
2022-02-02 1.761750 0.542268 0.059822 0.817176 0.607860
In [ ]:
#Adding a column
df3["BABA"]= [1,2,3,4,5,6,7,8,9,10,1,2,3,4,5,6,7,8,9,10,1,2,3,4,5,6,7,8,9,10,1,2,3]
df3
Out[ ]:
A B C D E BABA
2022-01-01 -1.424660 0.772210 0.783466 -0.712659 -2.551065 1
2022-01-02 1.382429 -1.708730 1.176728 1.949761 -0.492011 2
2022-01-03 0.325283 -0.834825 -0.879866 -0.137038 0.919688 3
2022-01-04 0.552770 -0.454804 -0.747360 1.800993 -0.275458 4
2022-01-05 -0.896864 -0.958084 0.579021 0.478870 0.260666 5
2022-01-06 -3.103231 -0.340718 0.050150 0.058931 -1.712098 6
2022-01-07 1.769682 -1.402604 -2.110711 -0.922010 -0.753877 7
2022-01-08 0.660826 1.017961 0.569261 -1.209834 1.056544 8
2022-01-09 0.125091 -0.340940 1.023370 -1.555463 0.322947 9
2022-01-10 1.915190 0.314069 -0.638422 -1.023607 -1.544831 10
2022-01-11 -0.637742 -1.434976 -1.276443 -1.153942 0.761682 1
2022-01-12 -0.742913 -1.484980 -0.917490 -0.283180 -0.053965 2
2022-01-13 0.201580 0.786547 -1.459392 0.126138 -0.312369 3
2022-01-14 1.321449 0.050355 -0.933186 -1.043294 1.676111 4
2022-01-15 -0.436737 -0.262544 0.686336 0.144395 0.625462 5
2022-01-16 -0.152087 0.789873 -1.096439 -0.172554 -1.109436 6
2022-01-17 0.379340 -0.193847 -1.390857 -1.056517 -0.158073 7
2022-01-18 0.785079 -1.145713 0.233175 -0.294474 -0.493845 8
2022-01-19 -1.454019 0.974247 -0.442348 0.044247 1.190501 9
2022-01-20 0.503909 -0.776441 -0.323965 0.813752 -0.409261 10
2022-01-21 -0.270772 0.698006 -0.801287 -0.182554 0.852357 1
2022-01-22 -0.528683 0.258139 0.589976 0.325295 1.688333 2
2022-01-23 0.240170 -0.518060 0.402815 -1.852271 -0.960517 3
2022-01-24 0.615130 -0.661989 1.052420 0.529098 -2.626316 4
2022-01-25 -0.344456 1.276411 -0.625939 -1.117180 -0.680828 5
2022-01-26 -0.073498 0.611255 1.133547 0.358266 -0.694200 6
2022-01-27 -1.981745 0.243877 -0.122814 0.343524 1.155793 7
2022-01-28 -0.023202 0.546315 -0.150933 -0.316904 0.220736 8
2022-01-29 0.201237 0.241144 1.538946 -0.113433 2.243980 9
2022-01-30 0.073743 -0.597354 -0.399961 0.624458 -0.305179 10
2022-01-31 -0.752144 -0.738474 0.248588 0.375347 0.729071 1
2022-02-01 0.635539 0.457294 -0.037721 -0.169999 -0.373402 2
2022-02-02 1.761750 0.542268 0.059822 0.817176 0.607860 3
In [ ]:
#Adding a new column having same value of previous column 
df3["Mean"] =df3["A"]
df3.head()
Out[ ]:
A B C D E BABA Mean
2022-01-01 -1.424660 0.772210 0.783466 -0.712659 -2.551065 1 -1.424660
2022-01-02 1.382429 -1.708730 1.176728 1.949761 -0.492011 2 1.382429
2022-01-03 0.325283 -0.834825 -0.879866 -0.137038 0.919688 3 0.325283
2022-01-04 0.552770 -0.454804 -0.747360 1.800993 -0.275458 4 0.552770
2022-01-05 -0.896864 -0.958084 0.579021 0.478870 0.260666 5 -0.896864

Assignment¶

In [ ]:
#Adding a column having mean of previous values
# #Assignment no 2  
df3["Mean"] =df3.mean(axis= 1)
df3.head()
Out[ ]:
A B C D E BABA Mean
2022-01-01 -1.424660 0.772210 0.783466 -0.712659 -2.551065 1 -0.508195
2022-01-02 1.382429 -1.708730 1.176728 1.949761 -0.492011 2 0.812944
2022-01-03 0.325283 -0.834825 -0.879866 -0.137038 0.919688 3 0.388361
2022-01-04 0.552770 -0.454804 -0.747360 1.800993 -0.275458 4 0.775559
2022-01-05 -0.896864 -0.958084 0.579021 0.478870 0.260666 5 0.509535
In [ ]:
df["New"]="new hai "
df.head()
Out[ ]:
A B C D E New
2022-01-01 -1.424660 0.772210 0.783466 -0.712659 -2.551065 new hai
2022-01-02 1.382429 -1.708730 1.176728 1.949761 -0.492011 new hai
2022-01-03 0.325283 -0.834825 -0.879866 -0.137038 0.919688 new hai
2022-01-04 0.552770 -0.454804 -0.747360 1.800993 -0.275458 new hai
2022-01-05 -0.896864 -0.958084 0.579021 0.478870 0.260666 new hai
In [ ]:
df.insert(2,"Beech me ", "18+")
df.head()
Out[ ]:
A B Beech me C D E New
2022-01-01 -1.424660 0.772210 18+ 0.783466 -0.712659 -2.551065 new hai
2022-01-02 1.382429 -1.708730 18+ 1.176728 1.949761 -0.492011 new hai
2022-01-03 0.325283 -0.834825 18+ -0.879866 -0.137038 0.919688 new hai
2022-01-04 0.552770 -0.454804 18+ -0.747360 1.800993 -0.275458 new hai
2022-01-05 -0.896864 -0.958084 18+ 0.579021 0.478870 0.260666 new hai
In [ ]:
df["Concatinated"]= df["A"]+df["B"]
df.head()
Out[ ]:
A B Beech me C D E New Concatinated
2022-01-01 -1.424660 0.772210 18+ 0.783466 -0.712659 -2.551065 new hai -0.652450
2022-01-02 1.382429 -1.708730 18+ 1.176728 1.949761 -0.492011 new hai -0.326301
2022-01-03 0.325283 -0.834825 18+ -0.879866 -0.137038 0.919688 new hai -0.509542
2022-01-04 0.552770 -0.454804 18+ -0.747360 1.800993 -0.275458 new hai 0.097965
2022-01-05 -0.896864 -0.958084 18+ 0.579021 0.478870 0.260666 new hai -1.854949

Chapter 6- Exploratory Data Analysis¶

This will show us how we can do EDA using python

Three important steps to keep in mind are :¶

  1. Understand the data
  2. Clean the data
  3. Find the realtionship
In [ ]:
#importing libraries 
import pandas as pd 
import numpy as np
import seaborn as sns 
import matplotlib.pyplot as plt 
In [ ]:
kashti= sns.load_dataset("titanic")
In [ ]:
kashti.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    object  
 3   age          714 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     889 non-null    object  
 8   class        891 non-null    category
 9   who          891 non-null    object  
 10  adult_male   891 non-null    bool    
 11  deck         203 non-null    category
 12  embark_town  889 non-null    object  
 13  alive        891 non-null    object  
 14  alone        891 non-null    bool    
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.7+ KB
In [ ]:
ks= kashti
In [ ]:
#just to see the datset 
ks.head()
Out[ ]:
survived pclass sex age sibsp parch fare embarked class who adult_male deck embark_town alive alone
0 0 3 male 22.0 1 0 7.2500 S Third man True NaN Southampton no False
1 1 1 female 38.0 1 0 71.2833 C First woman False C Cherbourg yes False
2 1 3 female 26.0 0 0 7.9250 S Third woman False NaN Southampton yes True
3 1 1 female 35.0 1 0 53.1000 S First woman False C Southampton yes False
4 0 3 male 35.0 0 0 8.0500 S Third man True NaN Southampton no True
In [ ]:
ks.shape
#Rows x column 
Out[ ]:
(891, 15)
In [ ]:
ks.tail()
Out[ ]:
survived pclass sex age sibsp parch fare embarked class who adult_male deck embark_town alive alone
886 0 2 male 27.0 0 0 13.00 S Second man True NaN Southampton no True
887 1 1 female 19.0 0 0 30.00 S First woman False B Southampton yes True
888 0 3 female NaN 1 2 23.45 S Third woman False NaN Southampton no False
889 1 1 male 26.0 0 0 30.00 C First man True C Cherbourg yes True
890 0 3 male 32.0 0 0 7.75 Q Third man True NaN Queenstown no True
In [ ]:
ks.describe()
Out[ ]:
survived pclass age sibsp parch fare
count 891.000000 891.000000 714.000000 891.000000 891.000000 891.000000
mean 0.383838 2.308642 29.699118 0.523008 0.381594 32.204208
std 0.486592 0.836071 14.526497 1.102743 0.806057 49.693429
min 0.000000 1.000000 0.420000 0.000000 0.000000 0.000000
25% 0.000000 2.000000 20.125000 0.000000 0.000000 7.910400
50% 0.000000 3.000000 28.000000 0.000000 0.000000 14.454200
75% 1.000000 3.000000 38.000000 1.000000 0.000000 31.000000
max 1.000000 3.000000 80.000000 8.000000 6.000000 512.329200
In [ ]:
#  unique values 
ks.nunique()
Out[ ]:
survived         2
pclass           3
sex              2
age             88
sibsp            7
parch            7
fare           248
embarked         3
class            3
who              3
adult_male       2
deck             7
embark_town      3
alive            2
alone            2
dtype: int64
In [ ]:
# coloumn names
ks.columns
Out[ ]:
Index(['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare',
       'embarked', 'class', 'who', 'adult_male', 'deck', 'embark_town',
       'alive', 'alone'],
      dtype='object')
In [ ]:
ks["sex"].unique()
Out[ ]:
array(['male', 'female'], dtype=object)
In [ ]:
ks['who'].unique()
Out[ ]:
array(['man', 'woman', 'child'], dtype=object)
In [ ]:
np.union1d(ks["who"].unique(), ks["sex"].unique())
Out[ ]:
array(['child', 'female', 'male', 'man', 'woman'], dtype=object)
In [ ]:
ks[['who', "sex"]].nunique()
Out[ ]:
who    3
sex    2
dtype: int64

cleaning and filtering the data¶

In [ ]:
# find missing valus inside 
ks.isnull()
Out[ ]:
survived pclass sex age sibsp parch fare embarked class who adult_male deck embark_town alive alone
0 False False False False False False False False False False False True False False False
1 False False False False False False False False False False False False False False False
2 False False False False False False False False False False False True False False False
3 False False False False False False False False False False False False False False False
4 False False False False False False False False False False False True False False False
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
886 False False False False False False False False False False False True False False False
887 False False False False False False False False False False False False False False False
888 False False False True False False False False False False False True False False False
889 False False False False False False False False False False False False False False False
890 False False False False False False False False False False False True False False False

891 rows × 15 columns

In [ ]:
ks.isnull().sum()
Out[ ]:
survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64
In [ ]:
# removing missing value column 
ks_clean= ks.drop (["deck"], axis= 1)
ks_clean.head()
Out[ ]:
survived pclass sex age sibsp parch fare embarked class who adult_male embark_town alive alone
0 0 3 male 22.0 1 0 7.2500 S Third man True Southampton no False
1 1 1 female 38.0 1 0 71.2833 C First woman False Cherbourg yes False
2 1 3 female 26.0 0 0 7.9250 S Third woman False Southampton yes True
3 1 1 female 35.0 1 0 53.1000 S First woman False Southampton yes False
4 0 3 male 35.0 0 0 8.0500 S Third man True Southampton no True
In [ ]:
ks_clean.isnull().sum()
Out[ ]:
survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
embark_town      2
alive            0
alone            0
dtype: int64
In [ ]:
ks_clean.shape
Out[ ]:
(891, 14)
In [ ]:
891-177-2
#117 row in age and 2 from embarked and embark town 
Out[ ]:
712
In [ ]:
ks_clean= ks_clean.dropna()
ks_clean.head()
Out[ ]:
survived pclass sex age sibsp parch fare embarked class who adult_male embark_town alive alone
0 0 3 male 22.0 1 0 7.2500 S Third man True Southampton no False
1 1 1 female 38.0 1 0 71.2833 C First woman False Cherbourg yes False
2 1 3 female 26.0 0 0 7.9250 S Third woman False Southampton yes True
3 1 1 female 35.0 1 0 53.1000 S First woman False Southampton yes False
4 0 3 male 35.0 0 0 8.0500 S Third man True Southampton no True
In [ ]:
ks_clean.dropna().shape
Out[ ]:
(712, 14)
In [ ]:
ks_clean.isnull().sum()
Out[ ]:
survived       0
pclass         0
sex            0
age            0
sibsp          0
parch          0
fare           0
embarked       0
class          0
who            0
adult_male     0
embark_town    0
alive          0
alone          0
dtype: int64
In [ ]:
ks_clean.shape
Out[ ]:
(712, 14)
In [ ]:
ks.shape
Out[ ]:
(891, 15)
In [ ]:
ks_clean["age"].value_counts()
Out[ ]:
24.00    30
22.00    27
18.00    26
19.00    25
28.00    25
         ..
36.50     1
55.50     1
0.92      1
23.50     1
74.00     1
Name: age, Length: 88, dtype: int64
In [ ]:
ks_clean["sex"].value_counts()
Out[ ]:
male      453
female    259
Name: sex, dtype: int64
In [ ]:
ks_clean["sex"].value_counts()
Out[ ]:
male      453
female    259
Name: sex, dtype: int64
In [ ]:
ks.describe()
Out[ ]:
survived pclass age sibsp parch fare
count 891.000000 891.000000 714.000000 891.000000 891.000000 891.000000
mean 0.383838 2.308642 29.699118 0.523008 0.381594 32.204208
std 0.486592 0.836071 14.526497 1.102743 0.806057 49.693429
min 0.000000 1.000000 0.420000 0.000000 0.000000 0.000000
25% 0.000000 2.000000 20.125000 0.000000 0.000000 7.910400
50% 0.000000 3.000000 28.000000 0.000000 0.000000 14.454200
75% 1.000000 3.000000 38.000000 1.000000 0.000000 31.000000
max 1.000000 3.000000 80.000000 8.000000 6.000000 512.329200
In [ ]:
ks_clean.describe()
Out[ ]:
survived pclass age sibsp parch fare
count 712.000000 712.000000 712.000000 712.000000 712.000000 712.000000
mean 0.404494 2.240169 29.642093 0.514045 0.432584 34.567251
std 0.491139 0.836854 14.492933 0.930692 0.854181 52.938648
min 0.000000 1.000000 0.420000 0.000000 0.000000 0.000000
25% 0.000000 1.000000 20.000000 0.000000 0.000000 8.050000
50% 0.000000 2.000000 28.000000 0.000000 0.000000 15.645850
75% 1.000000 3.000000 38.000000 1.000000 1.000000 33.000000
max 1.000000 3.000000 80.000000 5.000000 6.000000 512.329200
In [ ]:
ks_clean.columns
Out[ ]:
Index(['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare',
       'embarked', 'class', 'who', 'adult_male', 'embark_town', 'alive',
       'alone'],
      dtype='object')
In [ ]:
sns.boxplot(x= "sex", y='age', data =ks_clean )
# here we can see the outlyers in age 
Out[ ]:
<AxesSubplot:xlabel='sex', ylabel='age'>
In [ ]:
sns.boxplot(y='age', data =ks_clean )
Out[ ]:
<AxesSubplot:ylabel='age'>
In [ ]:
sns.distplot(ks_clean["age"] )

# here we are seeing the bell curve / histogram for normality check 
# here we can see that it is not perfectly bell curve means data is not perfect 
C:\Users\Epazz\AppData\Local\Programs\Python\Python310\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).
  warnings.warn(msg, FutureWarning)
Out[ ]:
<AxesSubplot:xlabel='age', ylabel='Density'>
In [ ]:
ks_clean["age"].mean()
Out[ ]:
29.64209269662921
In [ ]:
ks_clean.head()
Out[ ]:
survived pclass sex age sibsp parch fare embarked class who adult_male embark_town alive alone
0 0 3 male 22.0 1 0 7.2500 S Third man True Southampton no False
1 1 1 female 38.0 1 0 71.2833 C First woman False Cherbourg yes False
2 1 3 female 26.0 0 0 7.9250 S Third woman False Southampton yes True
3 1 1 female 35.0 1 0 53.1000 S First woman False Southampton yes False
4 0 3 male 35.0 0 0 8.0500 S Third man True Southampton no True
In [ ]:
# Removing an out liers 
ks_clean= ks_clean[ks_clean["age"]< 68]
ks_clean.head()
Out[ ]:
survived pclass sex age sibsp parch fare embarked class who adult_male embark_town alive alone
0 0 3 male 22.0 1 0 7.2500 S Third man True Southampton no False
1 1 1 female 38.0 1 0 71.2833 C First woman False Cherbourg yes False
2 1 3 female 26.0 0 0 7.9250 S Third woman False Southampton yes True
3 1 1 female 35.0 1 0 53.1000 S First woman False Southampton yes False
4 0 3 male 35.0 0 0 8.0500 S Third man True Southampton no True
In [ ]:
ks_clean.shape
Out[ ]:
(705, 14)
In [ ]:
ks_clean["age"].mean()
Out[ ]:
29.21797163120567
In [ ]:
sns.distplot(ks_clean["age"] )
C:\Users\Epazz\AppData\Local\Programs\Python\Python310\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).
  warnings.warn(msg, FutureWarning)
Out[ ]:
<AxesSubplot:xlabel='age', ylabel='Density'>
In [ ]:
sns.boxplot(y='age', data =ks_clean )
Out[ ]:
<AxesSubplot:ylabel='age'>
In [ ]:
ks_clean.head()
Out[ ]:
survived pclass sex age sibsp parch fare embarked class who adult_male embark_town alive alone
0 0 3 male 22.0 1 0 7.2500 S Third man True Southampton no False
1 1 1 female 38.0 1 0 71.2833 C First woman False Cherbourg yes False
2 1 3 female 26.0 0 0 7.9250 S Third woman False Southampton yes True
3 1 1 female 35.0 1 0 53.1000 S First woman False Southampton yes False
4 0 3 male 35.0 0 0 8.0500 S Third man True Southampton no True
In [ ]:
ks_clean.boxplot()
Out[ ]:
<AxesSubplot:>
In [ ]:
ks_clean= ks_clean[ks_clean["fare"]< 300]
In [ ]:
ks_clean.boxplot()
Out[ ]:
<AxesSubplot:>
In [ ]:
sns.distplot(ks_clean["fare"] )
C:\Users\Epazz\AppData\Local\Programs\Python\Python310\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).
  warnings.warn(msg, FutureWarning)
Out[ ]:
<AxesSubplot:xlabel='fare', ylabel='Density'>
In [ ]:
ks_clean.hist()
Out[ ]:
array([[<AxesSubplot:title={'center':'survived'}>,
        <AxesSubplot:title={'center':'pclass'}>],
       [<AxesSubplot:title={'center':'age'}>,
        <AxesSubplot:title={'center':'sibsp'}>],
       [<AxesSubplot:title={'center':'parch'}>,
        <AxesSubplot:title={'center':'fare'}>]], dtype=object)
In [ ]:
pd.value_counts(ks_clean["survived"])
Out[ ]:
0    418
1    284
Name: survived, dtype: int64
In [ ]:
pd.value_counts(ks_clean["survived"]).plot.bar()
Out[ ]:
<AxesSubplot:>
In [ ]:
pd.value_counts(ks_clean["sex"]).plot.bar()
Out[ ]:
<AxesSubplot:>
In [ ]:
ks_clean.groupby(["sex","class","who"]).mean()
Out[ ]:
survived pclass age sibsp parch fare adult_male alone
sex class who
female First child 0.666667 1.0 10.333333 0.666667 1.666667 160.962500 0.0 0.000000
man NaN NaN NaN NaN NaN NaN NaN NaN
woman 0.974684 1.0 35.139241 0.556962 0.468354 101.521730 0.0 0.367089
Second child 1.000000 2.0 6.600000 0.700000 1.300000 29.240000 0.0 0.000000
man NaN NaN NaN NaN NaN NaN NaN NaN
woman 0.906250 2.0 32.179688 0.468750 0.515625 20.812175 0.0 0.468750
Third child 0.533333 3.0 7.100000 1.533333 1.100000 19.023753 0.0 0.166667
man NaN NaN NaN NaN NaN NaN NaN NaN
woman 0.430556 3.0 27.854167 0.527778 0.888889 14.563542 0.0 0.458333
male First child 1.000000 1.0 5.306667 0.666667 2.000000 117.802767 0.0 0.000000
man 0.369565 1.0 41.201087 0.380435 0.282609 61.110824 1.0 0.543478
woman NaN NaN NaN NaN NaN NaN NaN NaN
Second child 1.000000 2.0 2.258889 0.888889 1.222222 27.306022 0.0 0.000000
man 0.067416 2.0 33.179775 0.325843 0.146067 20.606133 1.0 0.696629
woman NaN NaN NaN NaN NaN NaN NaN NaN
Third child 0.321429 3.0 6.515000 2.821429 1.321429 27.716371 0.0 0.035714
man 0.130045 3.0 28.607623 0.201794 0.125561 10.249231 1.0 0.825112
woman NaN NaN NaN NaN NaN NaN NaN NaN
In [ ]:
cor_ks_clean= ks_clean.corr()
cor_ks_clean
Out[ ]:
survived pclass age sibsp parch fare adult_male alone
survived 1.000000 -0.356549 -0.074335 -0.014483 0.095426 0.273531 -0.554567 -0.201175
pclass -0.356549 1.000000 -0.365121 0.061354 0.022519 -0.617591 0.102930 0.156030
age -0.074335 -0.365121 1.000000 -0.308906 -0.186271 0.103100 0.275035 0.187284
sibsp -0.014483 0.061354 -0.308906 1.000000 0.381803 0.197954 -0.311622 -0.629200
parch 0.095426 0.022519 -0.186271 0.381803 1.000000 0.259948 -0.366540 -0.574701
fare 0.273531 -0.617591 0.103100 0.197954 0.259948 1.000000 -0.228675 -0.333949
adult_male -0.554567 0.102930 0.275035 -0.311622 -0.366540 -0.228675 1.000000 0.402214
alone -0.201175 0.156030 0.187284 -0.629200 -0.574701 -0.333949 0.402214 1.000000
In [ ]:
sns.heatmap(cor_ks_clean)
Out[ ]:
<AxesSubplot:>
In [ ]:
sns.heatmap(cor_ks_clean , annot=True)
Out[ ]:
<AxesSubplot:>
In [ ]:
sns.relplot(x= "age", y="fare", hue="sex", data=ks_clean)
Out[ ]:
<seaborn.axisgrid.FacetGrid at 0x25b2f22ab90>
In [ ]:
sns.catplot(x= "sex", y="fare",data=ks_clean)
Out[ ]:
<seaborn.axisgrid.FacetGrid at 0x25b2f39ed10>
In [ ]:
sns.catplot(x= "sex", y="fare",data=ks_clean, kind= "bar")
Out[ ]:
<seaborn.axisgrid.FacetGrid at 0x25b2c0b28c0>
In [ ]:
sns.catplot(x= "sex", y="age", hue= "who" , data=ks_clean, kind= "box")
Out[ ]:
<seaborn.axisgrid.FacetGrid at 0x25b2c69dc00>
In [ ]:
sns.catplot(x= "sex", y="fare", hue= "who" , data=ks_clean, kind= "box")
Out[ ]:
<seaborn.axisgrid.FacetGrid at 0x25b3066b850>
In [ ]:
ks_clean.head()
Out[ ]:
survived pclass sex age sibsp parch fare embarked class who adult_male embark_town alive alone
0 0 3 male 22.0 1 0 7.2500 S Third man True Southampton no False
1 1 1 female 38.0 1 0 71.2833 C First woman False Cherbourg yes False
2 1 3 female 26.0 0 0 7.9250 S Third woman False Southampton yes True
3 1 1 female 35.0 1 0 53.1000 S First woman False Southampton yes False
4 0 3 male 35.0 0 0 8.0500 S Third man True Southampton no True
In [ ]:
# log transformation 
ks_clean["fare_log"] = np.log(ks_clean["fare"])
ks_clean.head()
C:\Users\Epazz\AppData\Local\Programs\Python\Python310\lib\site-packages\pandas\core\arraylike.py:364: RuntimeWarning: divide by zero encountered in log
  result = getattr(ufunc, method)(*inputs, **kwargs)
Out[ ]:
survived pclass sex age sibsp parch fare embarked class who adult_male embark_town alive alone fare_log
0 0 3 male 22.0 1 0 7.2500 S Third man True Southampton no False 1.981001
1 1 1 female 38.0 1 0 71.2833 C First woman False Cherbourg yes False 4.266662
2 1 3 female 26.0 0 0 7.9250 S Third woman False Southampton yes True 2.070022
3 1 1 female 35.0 1 0 53.1000 S First woman False Southampton yes False 3.972177
4 0 3 male 35.0 0 0 8.0500 S Third man True Southampton no True 2.085672
In [ ]:
sns.catplot(x= "sex", y="fare_log" , data=ks_clean, kind= "box")
Out[ ]:
<seaborn.axisgrid.FacetGrid at 0x25b306c1510>

Chapter 7 - Data Wrangling¶

In [ ]:
# import libraries 
import pandas as pd 
import numpy as np 
import seaborn as sns 
import matplotlib.pyplot as plt 
In [ ]:
kashti = sns.load_dataset("titanic")
ks1 = kashti 
# # ks2 = kashti 
In [ ]:
kashti.head()
Out[ ]:
survived pclass sex age sibsp parch fare embarked class who adult_male deck embark_town alive alone
0 0 3 male 22.0 1 0 7.2500 S Third man True NaN Southampton no False
1 1 1 female 38.0 1 0 71.2833 C First woman False C Cherbourg yes False
2 1 3 female 26.0 0 0 7.9250 S Third woman False NaN Southampton yes True
3 1 1 female 35.0 1 0 53.1000 S First woman False C Southampton yes False
4 0 3 male 35.0 0 0 8.0500 S Third man True NaN Southampton no True
In [ ]:
# simple operations (Math operations)
(kashti["age"]+12).head(10) 
Out[ ]:
0    34.0
1    50.0
2    38.0
3    47.0
4    47.0
5     NaN
6    66.0
7    14.0
8    39.0
9    26.0
Name: age, dtype: float64
In [ ]:
(kashti["age"]*2).head(3) 
Out[ ]:
0    44.0
1    76.0
2    52.0
Name: age, dtype: float64

Dealing with missing values¶

  • In a dat set missing values are either ? or N/A or NAN , or 0 or a blank cell.
  • Jab kabhi data na ho kisi aik row main kisi b aik parameter ka

Steps:

1- Koshsih kren dobara data collect kar len agar kahin ghalti hai.

2- missing value wala variable (coloumn) hi nikal den agr data per effect nahi hta ya simple row or data entry remove kr den .

3- Replace the missing values :

- How?
    1. Average value of entire variable or similar data poit 
    2. frequency or MODE replacement 
    3. Replace based on other functions (Data sampler knows that)
    4. ML algorithm can also be used 
    5. Leave it like that 

- Why?
    1. Its better beacuse no data is lost 
    2. Less accurate
In [ ]:
# where exactly missing value are ?
kashti.isnull().sum()
Out[ ]:
survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64
In [ ]:
# use a drop.na method 
print (kashti.shape)
(891, 15)
In [ ]:
# removing missing value column 
# ks_clean= ks.drop (["deck"], axis= 1)
# ks_clean.head()
In [ ]:
kashti.dropna(subset=["deck"], axis=0 , inplace = True)
kashti.shape
Out[ ]:
(203, 15)
In [ ]:
kashti.isnull().sum()
Out[ ]:
survived        0
pclass          0
sex             0
age            19
sibsp           0
parch           0
fare            0
embarked        2
class           0
who             0
adult_male      0
deck            0
embark_town     2
alive           0
alone           0
dtype: int64
In [ ]:
# removing na from whole dataframe
kashti = kashti.dropna()
kashti.isnull().sum()
Out[ ]:
survived       0
pclass         0
sex            0
age            0
sibsp          0
parch          0
fare           0
embarked       0
class          0
who            0
adult_male     0
deck           0
embark_town    0
alive          0
alone          0
dtype: int64
In [ ]:
kashti.shape
Out[ ]:
(182, 15)
In [ ]:
ks1.isnull().sum()
Out[ ]:
survived        0
pclass          0
sex             0
age            19
sibsp           0
parch           0
fare            0
embarked        2
class           0
who             0
adult_male      0
deck            0
embark_town     2
alive           0
alone           0
dtype: int64

Replacing missing values with the average of that column¶

In [ ]:
# finding an average (mean)
mean = ks1["age"].mean()
mean
Out[ ]:
35.77945652173913
In [ ]:
# replacing Nan with mean of the data (udating as well)
ks1["age"] = ks1["age"].replace(np.nan , mean)
In [ ]:
ks1["age"].head(10)
Out[ ]:
1     38.000000
3     35.000000
6     54.000000
10     4.000000
11    58.000000
21    34.000000
23    28.000000
27    19.000000
31    35.779457
52    49.000000
Name: age, dtype: float64
In [ ]:
ks1.isnull().sum()
Out[ ]:
survived       0
pclass         0
sex            0
age            0
sibsp          0
parch          0
fare           0
embarked       2
class          0
who            0
adult_male     0
deck           0
embark_town    2
alive          0
alone          0
dtype: int64

Assignment¶

In [ ]:
mode = ks1["deck"].mode()[0]
mode
Out[ ]:
'C'
In [ ]:
ks1['deck'].fillna(mode , inplace=True)
ks1['deck'].head(12)
Out[ ]:
1     C
3     C
6     E
10    G
11    C
21    D
23    A
27    C
31    B
52    D
54    B
55    C
Name: deck, dtype: category
Categories (7, object): ['A', 'B', 'C', 'D', 'E', 'F', 'G']
In [ ]:
ks1.dropna(subset=["embarked"], axis=0 , inplace = True)
In [ ]:
ks1.isnull().sum()
Out[ ]:
survived       0
pclass         0
sex            0
age            0
sibsp          0
parch          0
fare           0
embarked       0
class          0
who            0
adult_male     0
deck           0
embark_town    0
alive          0
alone          0
dtype: int64

some others

cols = ["workclass", "native-country"]
df[cols]=df[cols].fillna(df.mode().iloc[0])

or we can use a mapping

ks1['deck'].map({"A" : np.nan})

Data Formatting¶

  • Data ko aik common standard per lana
  • Ensure data is consistent and understandable
    • Easy to gather
    • Easy to work with
      • Faisalabad (FSD)
      • Lahore (LHR)
      • Islamabad (ISB)
      • karachi (KCH)
      • Peshawar (PEW)
      • converting g to kg or similar uni for all
      • one standard unit in each column
      • ft != cm
In [ ]:
# know the data type and convert it into the know one 
kashti.dtypes
Out[ ]:
survived          int64
pclass            int64
sex              object
age             float64
sibsp             int64
parch             int64
fare            float64
embarked         object
class          category
who              object
adult_male         bool
deck           category
embark_town      object
alive            object
alone              bool
dtype: object
In [ ]:
# use this method to convert datatype from one to another format 
kashti["survived"]= kashti["survived"].astype("float64")
kashti.dtypes
C:\Users\Epazz\AppData\Local\Temp/ipykernel_12924/2743140032.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  kashti["survived"]= kashti["survived"].astype("float64")
Out[ ]:
survived        float64
pclass            int64
sex              object
age             float64
sibsp             int64
parch             int64
fare            float64
embarked         object
class          category
who              object
adult_male         bool
deck           category
embark_town      object
alive            object
alone              bool
dtype: object
In [ ]:
# here we will convert age into days instead of years 
ks1["age"] = ks1["age"]*365
ks1["age"].head(8)
Out[ ]:
1     13870.0
3     12775.0
6     19710.0
10     1460.0
11    21170.0
21    12410.0
23    10220.0
27     6935.0
Name: age, dtype: float64
In [ ]:
# always rename afterwards 
ks1.rename(columns={"age": "age in days"}, inplace=True)
ks1.head()
Out[ ]:
survived pclass sex age in days sibsp parch fare embarked class who adult_male deck embark_town alive alone
1 1 1 female 13870.0 1 0 71.2833 C First woman False C Cherbourg yes False
3 1 1 female 12775.0 1 0 53.1000 S First woman False C Southampton yes False
6 0 1 male 19710.0 0 0 51.8625 S First man True E Southampton no True
10 1 3 female 1460.0 1 1 16.7000 S Third child False G Southampton yes False
11 1 1 female 21170.0 0 0 26.5500 S First woman False C Southampton yes True
In [ ]:
ks1["age in days"]= ks1["age in days"].astype("int64")
ks1.head()
Out[ ]:
survived pclass sex age in days sibsp parch fare embarked class who adult_male deck embark_town alive alone
1 1 1 female 13870 1 0 71.2833 C First woman False C Cherbourg yes False
3 1 1 female 12775 1 0 53.1000 S First woman False C Southampton yes False
6 0 1 male 19710 0 0 51.8625 S First man True E Southampton no True
10 1 3 female 1460 1 1 16.7000 S Third child False G Southampton yes False
11 1 1 female 21170 0 0 26.5500 S First woman False C Southampton yes True

Data Normailization¶

  • Uniform the data
  • Making sure they have same impact
  • AIk machli samundar me or aik jar main
  • Also for computational reasons
In [ ]:
ks1.head()
Out[ ]:
survived pclass sex age in days sibsp parch fare embarked class who adult_male deck embark_town alive alone
1 1 1 female 13870 1 0 71.2833 C First woman False C Cherbourg yes False
3 1 1 female 12775 1 0 53.1000 S First woman False C Southampton yes False
6 0 1 male 19710 0 0 51.8625 S First man True E Southampton no True
10 1 3 female 1460 1 1 16.7000 S Third child False G Southampton yes False
11 1 1 female 21170 0 0 26.5500 S First woman False C Southampton yes True
In [ ]:
ks4= ks1[["age in days","fare"]]
ks4.head()
Out[ ]:
age in days fare
1 13870 71.2833
3 12775 53.1000
6 19710 51.8625
10 1460 16.7000
11 21170 26.5500
  • The above data is really in wide range and we need to normalize and hard to compare
  • Normalization change the values to the range of 0-1 (now both variable has similar influence on our models)

Method of Normalization¶

  1. Simple feature scaling
    • x(new)= x(old)/x(max )
  2. Min-Max Method
  3. Z-score (standard score) -3 to +3
  4. Log transformation
In [ ]:
# simple feature scalling 
ks4["fare"]=  ks4["fare"]/ks4["fare"].max()
ks4["age in days"]=  ks4["age in days"]/ks4["age in days"].max()
ks4.head()
C:\Users\Epazz\AppData\Local\Temp/ipykernel_12924/607069502.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ks4["fare"]=  ks4["fare"]/ks4["fare"].max()
C:\Users\Epazz\AppData\Local\Temp/ipykernel_12924/607069502.py:3: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ks4["age in days"]=  ks4["age in days"]/ks4["age in days"].max()
Out[ ]:
age in days fare
1 0.4750 0.139136
3 0.4375 0.103644
6 0.6750 0.101229
10 0.0500 0.032596
11 0.7250 0.051822
In [ ]:
# Min- Max method 
# x.new = (x.old - x.min) / (x.max- x.min)
ks4["fare"] = (ks4["fare"]-ks4["fare"].min())/ (ks4["fare"].max()-ks4["fare"].min())
ks4.head()
C:\Users\Epazz\AppData\Local\Temp/ipykernel_12924/4070571851.py:3: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ks4["fare"] = (ks4["fare"]-ks4["fare"].min())/ (ks4["fare"].max()-ks4["fare"].min())
Out[ ]:
age in days fare
1 0.4750 0.139136
3 0.4375 0.103644
6 0.6750 0.101229
10 0.0500 0.032596
11 0.7250 0.051822
In [ ]:
# z-score method 
# x.new = (x.old -x.mean) / x.std
ks4["fare"]  =(ks4["fare"]- ks4["fare"].mean()) / ks4["fare"].std()
ks4.head()
C:\Users\Epazz\AppData\Local\Temp/ipykernel_12924/3694817774.py:3: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ks4["fare"]  =(ks4["fare"]- ks4["fare"].mean()) / ks4["fare"].std()
Out[ ]:
age in days fare
1 0.4750 -0.067057
3 0.4375 -0.309853
6 0.6750 -0.326377
10 0.0500 -0.795891
11 0.7250 -0.664367
In [ ]:
ks =sns.load_dataset("titanic")
ks["fare"].head() 
Out[ ]:
0     7.2500
1    71.2833
2     7.9250
3    53.1000
4     8.0500
Name: fare, dtype: float64
In [ ]:
# log transfromation
# x.new = np.log(x.old) 
ks["fare"] = np.log(ks["fare"])
ks["fare"].head()
C:\Users\Epazz\AppData\Local\Programs\Python\Python310\lib\site-packages\pandas\core\arraylike.py:364: RuntimeWarning: divide by zero encountered in log
  result = getattr(ufunc, method)(*inputs, **kwargs)
Out[ ]:
0    1.981001
1    4.266662
2    2.070022
3    3.972177
4    2.085672
Name: fare, dtype: float64

Binning¶

  • Grouping of values into smaller number of vvalues (bins)
  • Convert numeric into categories ( jawan, achay , booray ) or 1- 16, 17-40 etc
  • to have a better understanding of groups
    • low vs mid vs high price
In [ ]:
kashti.head()
Out[ ]:
survived pclass sex age sibsp parch fare embarked class who adult_male deck embark_town alive alone
1 1.0 1 female 38.0 1 0 71.2833 C First woman False C Cherbourg yes False
3 1.0 1 female 35.0 1 0 53.1000 S First woman False C Southampton yes False
6 0.0 1 male 54.0 0 0 51.8625 S First man True E Southampton no True
10 1.0 3 female 4.0 1 1 16.7000 S Third child False G Southampton yes False
11 1.0 1 female 58.0 0 0 26.5500 S First woman False C Southampton yes True
In [ ]:
ks4["age in days"].shape
Out[ ]:
(201,)
In [ ]:
# Creating bins 
bins = np.linspace(min(ks1["age in days"]), max(ks1["age in days"]), 4)
bins
Out[ ]:
array([  335.        ,  9956.66666667, 19578.33333333, 29200.        ])
In [ ]:
age_groups = ["bachay", "jawan", "Boorhay"]
ks1["age in days"] = pd.cut(ks1["age in days"], bins, labels= age_groups ,include_lowest=True )
ks1["age in days"]
Out[ ]:
1        jawan
3        jawan
6      Boorhay
10      bachay
11     Boorhay
        ...   
871      jawan
872      jawan
879    Boorhay
887     bachay
889     bachay
Name: age in days, Length: 201, dtype: category
Categories (3, object): ['bachay' < 'jawan' < 'Boorhay']
In [ ]:
ks4.head()
Out[ ]:
age in days fare
1 0.4750 -0.067057
3 0.4375 -0.309853
6 0.6750 -0.326377
10 0.0500 -0.795891
11 0.7250 -0.664367

Converting catagories into dummies¶

  • easy to use for computation
  • male, Femal (0,1)
In [ ]:
ks1.head()
Out[ ]:
survived pclass sex age in days sibsp parch fare embarked class who adult_male deck embark_town alive alone
1 1 1 female jawan 1 0 71.2833 C First woman False C Cherbourg yes False
3 1 1 female jawan 1 0 53.1000 S First woman False C Southampton yes False
6 0 1 male Boorhay 0 0 51.8625 S First man True E Southampton no True
10 1 3 female bachay 1 1 16.7000 S Third child False G Southampton yes False
11 1 1 female Boorhay 0 0 26.5500 S First woman False C Southampton yes True
In [ ]:
ks5 = sns.load_dataset("titanic")
In [ ]:
data = pd.get_dummies(ks5["sex"])
In [ ]:
# Drop column 
ks5 = ks5.drop('sex',axis = 1)
# Join 
ks5 = ks5.join(data)
ks5.head() 
Out[ ]:
survived pclass age sibsp parch fare embarked class who adult_male deck embark_town alive alone female male
0 0 3 22.0 1 0 7.2500 S Third man True NaN Southampton no False 0 1
1 1 1 38.0 1 0 71.2833 C First woman False C Cherbourg yes False 1 0
2 1 3 26.0 0 0 7.9250 S Third woman False NaN Southampton yes True 1 0
3 1 1 35.0 1 0 53.1000 S First woman False C Southampton yes False 1 0
4 0 3 35.0 0 0 8.0500 S Third man True NaN Southampton no True 0 1

Chapter 8- Statistics¶

Types of Experimental Designs | Statistics¶

1. Completely Randomized Design (CRD):¶

The design which is used when the experimental material is limited and homogeneous is known as completely randomized design. This design is specially used for pot culture experiments.

The whole field is divided into plots of similar shape and size. The number of plots is equal to the product of treatments and replications. These plots are then serially numbered

2. Randomized Block Design (RBD):¶

The experimental design which controls the fertility variation in one direction only is known as randomized block design (RBD). Adoption of this design is useful when the variation between the blocks is significant.

First the experimental field is divided into homogeneous groups equal to the number of replications. These homogeneous groups are known as blocks. Then each block is further divided into plots of similar shape and size equal to the number of treatments.

3. Latin Square Design (LSD):¶

The experimental design which simultaneously controls the fertility variation in two directions is called Latin square design (LSD). In other words, Latin square designs are adopted for eliminating the variation of two factors which are generally called rows and columns.

In this design the field is divided into homogeneous blocks in two ways. The blocks in one direction are commonly known as rows and the blocks in other direction as columns. The number of plots in each row is the same as the number of plots in each column. This number is equal to the number of treatments.

4. Split Plot Design (SPD):¶

The experimental design in which experimental plots are split or divided into main plots, sub­plots and ultimate-plots is called split plot design (SPD). In this design several factors are studied simultaneously with different levels of precision. The factors are such that some of them require larger plots like irrigation, depth of ploughing and sowing dates, and others require smaller plots.

The layout of this design consists of four steps as given below:

(a) First the experimental field is divided into homogeneous blocks equal to the number of replications

(b) Then each block is divided into a number of plots equal to the number of levels of the first factor. These plots are known as main plots.

(c) Then each main plot is divided into a number of sub-plots equal to the number of levels of second factor.

(d) Then each sub-plot is divided into a number of ultimate plots equal to the number of levels of third factor.

5. Lattice Design:¶

Lattice designs are incomplete block designs in which the number of varieties or treatments forms a square.

The experimental field is divided into homogeneous parts equal to the number of replications. Each part is further divided into plots of equal size in such a way that the number of plots should form a square and each replication has equal plots in each direction (i.e., equal rows and columns).

6. Augmented Designs:¶

This is an experimental design which is used to test a large number of germplasm lines in a limited area.

In this design, standard or check varieties are replicated among the cultures. Thus, standards are replicated and cultures are non-replicated. The number of check varieties should be at least 4.

Tests and their types¶

  • Parametric test
  • Non-Parametric test

Parametric test¶

Parametric tests are those tests for which we have prior knowledge of the population distribution

Non-Parametric test¶

Non-Parametric tests are those in which we don’t make any assumption about the parameters for the given population

Steps before data analysis¶

Step 1 : Normality test¶

Test to be used are :
    1- Shapiro-Wilk Test 
        * Specififc (Reliable)
    2- Kolmogorov-Simirnov Test
        * General (Less- reliable)

Step 2 : Homogeneity test¶

The variance of the variable in data are equal 
Tesy to be used : **Levene's Test**

Step 3 : Purpose of test¶

Know the purpose of the research question 
Their are two types of purpose 
    1- Comparison
    2- Relationship 

Step 4 : Type of the data¶

Know the type of the data 
    - Catagorical = Qualitative 
    - Numerical = Quantitative 

Step 5 : Statistical test¶

Choose a statistical test from three main families 
    1- Chi-Squared test 
        Purpose : Comparison 
        Data : Catagorical only 
        Types:
            * Chi- squared test of homogeneity
            * Chi- squared test of independence  
    2- t-test/ANOVA
        Purpose : Comparison 
        Data : Catagorical and continuous
        Types: 
            * One sample t-test 
            * Two sample t-test
                ** Un-paired t-test
                ** Paired t-test
            * ANOVA
                 ** One way ANOVA
                 ** Two way ANOVA
                 ** Repeated measures of ANOVA 
            * MANOVA
            * MANCOVA 
    3- Correlation 
        Purpose : Relationship 
        Data : Continuous only 
        Types:
            1- Pearson Correlation
            2- Regression

Definations¶

  • Chi-Squared test

      Chi-square is a statistical test used to examine the differences between categorical variables from a random sample in order to judge goodness of fit between expected and observed results.This is non parametric test.
  • One sample t-test

      The One Sample t Test examines whether the mean of a population is statistically different from a known or hypothesized value. The One Sample t Test is a parametric test.
  • Two sample t-test or Independent Samples t-Test

      The Independent Samples t Test compares the means of two independent groups in order to determine whether there is statistical evidence that the associated population means are significantly different. The Independent Samples t Test is a parametric test.

Unpaired t-test = Comparison between math marks of girls and boys (comparison between different population is onvolve)

Paired t-test = Comparison betwwen the math and stat marks of boys (boy= pne type of population and comparison is envolve in thier subjects)

  • ANOVA = Analysis of varianvce
  • One way ANOVA

      "one-way" ANOVA compares levels (i.e. groups) of a single factor based on single continuous response variable (e.g. comparing *test score*(continous variable) by 'level of education')
    
      One factor and one continous variable
  • Two way ANOVA

      a "two-way" ANOVA compares levels of two or more factors for mean differences on a single continuous response variable(e.g. comparing test score by both 'level of education' and 'zodiac sign').
    
      Two factor for one continous variable 
  • Repeated measures of ANOVA

      The repeated measures ANOVA compares means across one or more variables that are based on repeated observations. A repeated measures ANOVA model can also include zero or more independent variables. Again, a repeated measures ANOVA has at least 1 dependent variable that has more than one observation.
  • MANOVA "Multivariate Analysis of Variance"

      In basic terms, A MANOVA is an ANOVA with two or more continuous response variables
  • One Way MANOVA

     When comparing two or more continuous response variables by a single factor, a one-way MANOVA is appropriate (e.g. comparing ‘test score’ and ‘annual income’ together by ‘level of education’).
    
     Two continous variable with one factor
  • Two way MANOVA

     A two-way MANOVA also entails two or more continuous response variables, but compares them by at least two factors (e.g. comparing ‘test score’ and ‘annual income’ together by both ‘level of education’ and ‘zodiac sign’). 
    
     Two continous variable with two factors
  • MANCOVA (Multi-variate analysis of co-variance)

      an analysis evolves from MANOVA to MANCOVA when one or more covariates are added to the mix. 
      eg: MANCOVA compares two or more continuous response variables (e.g.comparing continous variable (Test Scores and Annual Income) by levels of a factor variable (e.g. Level of Education), controlling for a covariate (e.g. Number of Hours Spent Studying).

Stat Tests¶

Shapiro Wilk test¶

Test whether a data sample has gaussian distribution.

Assumptions¶

  1. Observation in each sample are independant and equally distributed
  2. Interpretation
  • Null Hypothesis (H0) = the sample has normal Gaussian distribution
  • Alternate Hhypothesis (H1) = the sample does not have Gaussian/Normal distribution
In [ ]:
from scipy.stats import shapiro 
data = [0.873, 2.817 , 0.121, -0.945, -0.055, -1.436, 0.360 , -1.478, -1.637, -1.869]
shapiro(data)
#here we see that the p value is greater than 0.05 so the data is normally distributed 
Out[ ]:
ShapiroResult(statistic=0.8951009511947632, pvalue=0.19340917468070984)
In [ ]:
from scipy.stats import shapiro 
data = [0.873, 2.817 , 0.121, -0.945, -0.055, -1.436, 0.360 , -1.478, -1.637, -1.869]
stat, p=shapiro(data)
print("stat =",stat)
print("p =" ,p)
stat = 0.8951009511947632
p = 0.19340917468070984
In [ ]:
# To write it respectively 
from scipy.stats import shapiro 
data = [0.873, 2.817 , 0.121, -0.945, -0.055, -1.436, 0.360 , -1.478, -1.637, -1.869]
stat, p=shapiro(data)
print ("stat=%3f, p= %3f" %(stat, p))
# print("stat =",stat)
# print("p =" ,p)
stat=0.895101, p= 0.193409
In [ ]:
#example of the Shapiro-WIlk Normality test 
from scipy.stats import shapiro 
data = [0.873, 2.817 , 0.121, -0.945, -0.055, -1.436, 0.360 , -1.478, -1.637, -1.869]
stat, p=shapiro(data)
print ("stat=%3f, p= %3f" %(stat, p))

if p> 0.05:
    print("probability Gaussian")
else: 
    print ("Probability not gaussian")
stat=0.895101, p= 0.193409
probability Gaussian
In [ ]:
import pandas as pd 
import seaborn as sns 
import matplotlib.pyplot as plt 
import numpy as np 
In [ ]:
ks = sns.load_dataset("titanic")
sns.boxplot(ks["age"])
C:\Users\Epazz\AppData\Local\Programs\Python\Python310\lib\site-packages\seaborn\_decorators.py:36: FutureWarning: Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
  warnings.warn(
Out[ ]:
<AxesSubplot:xlabel='age'>
In [ ]:
ks["age"].hist()
Out[ ]:
<AxesSubplot:>
In [ ]:
# Normality test 
from scipy.stats import shapiro 
stat, p=shapiro(ks["age"])
print("stat =",stat)
print("p =" ,p)

if p> 0.05:
    print("probability Gaussian or the data is normal ")
else: 
    print ("Probability not gaussian or the data is not normal ")
stat = nan
p = 1.0
probability Gaussian or the data is normal 
In [ ]:
from scipy.stats import shapiro 
ks =ks.dropna()
stat, p=shapiro(ks["age"])

print("stat =",stat)
print("p =" ,p)

if p> 0.05:
    print("probability Gaussian or the data is normal ")
else: 
    print ("Probability not gaussian or the data is not normal ")
stat = 0.9906661510467529
p = 0.28414419293403625
probability Gaussian or the data is normal 
In [ ]:
sns.boxplot(ks["age"])
C:\Users\Epazz\AppData\Local\Programs\Python\Python310\lib\site-packages\seaborn\_decorators.py:36: FutureWarning: Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
  warnings.warn(
Out[ ]:
<AxesSubplot:xlabel='age'>
In [ ]:
from scipy.stats import shapiro 
ks =ks.dropna()
stat, p=shapiro(ks["fare"])

print("stat =",stat)
print("p =" ,p)

if p> 0.05:
    print("probability Gaussian or the data is normal ")
else: 
    print ("Probability not gaussian or the data is not normal ")
stat = 0.7430529594421387
p = 1.6486953687823121e-16
Probability not gaussian or the data is not normal 
In [ ]:
sns.boxplot(ks["fare"])
C:\Users\Epazz\AppData\Local\Programs\Python\Python310\lib\site-packages\seaborn\_decorators.py:36: FutureWarning: Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
  warnings.warn(
Out[ ]:
<AxesSubplot:xlabel='fare'>
In [ ]:
ks["fare"].hist()
Out[ ]:
<AxesSubplot:>

2- Correlation test¶

1. Pearson's Correlation Coefficient¶

Test whether two samples have a linear relationship

Asumptions¶

  1. Observations in each sample are independant and identically distributed (iid)
  2. Observation in each sample are normally distributed
  3. observations in each sample have the same variance
  4. interpretation
    • H0 : the two samples are independant
    • H1 : there is a dependancy between the samples.
In [ ]:
# Example of the Pearson's Correlation test
from scipy.stats import pearsonr
data1 = [0.873, 2.817, 0.121, -0.945, -0.055, -1.436, 0.360, -1.478, -1.637, -1.869]
data2 = [0.353, 3.517, 0.125, -7.545, -0.555, -1.536, 3.350, -1.578, -3.537, -1.579]
stat, p = pearsonr(data1, data2)
print('stat=%.3f, p=%.3f' % (stat, p))
if p > 0.05:
  print('Probably independent(No correlation)')
else:
  print('Probably dependent (correlation exists)')
stat=0.688, p=0.028
Probably dependent (correlation exists)
In [ ]:
ks1 = sns.load_dataset("titanic")
In [ ]:
# Example of the Pearson's Correlation test
from scipy.stats import pearsonr
stat, p = pearsonr(ks1["age"], ks1["fare"])
print('stat=%.3f, p=%.3f' % (stat, p))
if p > 0.05:
  print('Probably independent')
else:
  print('Probably dependent')
  # here there is error as the array must not contain Nan values so we should drop them first 
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
~\AppData\Local\Temp/ipykernel_892/996795437.py in <module>
      1 # Example of the Pearson's Correlation test
      2 from scipy.stats import pearsonr
----> 3 stat, p = pearsonr(ks1["age"], ks1["fare"])
      4 print('stat=%.3f, p=%.3f' % (stat, p))
      5 if p > 0.05:

~\AppData\Local\Programs\Python\Python310\lib\site-packages\scipy\stats\stats.py in pearsonr(x, y)
   4043     # scipy.linalg.norm(xm) does not overflow if xm is, for example,
   4044     # [-5e210, 5e210, 3e200, -3e200]
-> 4045     normxm = linalg.norm(xm)
   4046     normym = linalg.norm(ym)
   4047 

~\AppData\Local\Programs\Python\Python310\lib\site-packages\scipy\linalg\misc.py in norm(a, ord, axis, keepdims, check_finite)
    143     # Differs from numpy only in non-finite handling and the use of blas.
    144     if check_finite:
--> 145         a = np.asarray_chkfinite(a)
    146     else:
    147         a = np.asarray(a)

~\AppData\Local\Programs\Python\Python310\lib\site-packages\numpy\lib\function_base.py in asarray_chkfinite(a, dtype, order)
    601     a = asarray(a, dtype=dtype, order=order)
    602     if a.dtype.char in typecodes['AllFloat'] and not np.isfinite(a).all():
--> 603         raise ValueError(
    604             "array must not contain infs or NaNs")
    605     return a

ValueError: array must not contain infs or NaNs
In [ ]:
# Example of the Pearson's Correlation test
from scipy.stats import pearsonr
ks1= ks1.dropna()
stat, p = pearsonr(ks1["age"], ks1["fare"])
print('stat=%.3f, p=%.3f' % (stat, p))
if p > 0.05:
  print('Probably independent')
else:
  print('Probably dependent')
stat=-0.091, p=0.223
Probably independent

2- Spearman's Rank Correlation¶

  • Non parametric analysis (if there is no normal distribution)
  • Test whether two samples have a monotonic relationship ### Assumptions
    1. Observations in each sample are independent and identically distributed (iid).
    2. Observations in each sample can be ranked.
    3. Interpertation
  • H0 : the two samples are independent
  • H1 : there is a dependancy between the samples

3- Chi-Squared Test¶

Test whether two catagorical variables are related or independant

Assumptions¶

  • Observation used in the calculation of the contigency table are independant.
  • 25 or more examples in each cell oh the contigency atble .
  • interpretation
  • H0 : The twwo samples are independant
  • H1 : there in dependancy between the samples

3- Parametric Statistical Hypothesis Test¶

1- Student's t-test¶

Test whether the means of two independent samples are significantly different

Asumptions¶

  1. Observations in each sample are independant and identically distributed (iid)
  2. Observation in each sample are normally distributed
  3. observations in each sample have the same variance
  4. interpretation
    • H0 : the means of the samples are equal
    • H1 : the means of the saamples are unequal.
In [ ]:
# Example of the Student's t-test
from scipy.stats import ttest_ind
data1 = [0.873, 2.817, 0.121, -0.945, -0.055, -1.436, 0.360, -1.478, -1.637, -1.869]
data2 = [1.142, -0.432, -0.938, -0.729, -0.846, -0.157, 0.500, 1.183, -1.075, -0.169]
stat, p = ttest_ind(data1, data2)
print('stat=%.3f, p=%.3f' % (stat, p))
if p > 0.05:
  print('Probably the same dataset')
else:
  print('Probably different data set')
stat=-0.326, p=0.748
Probably the same dataset

2- Paired Student's t-test¶

Test whether the means of two paired samples are significantly different.

Asumptions¶

  1. Observations in each sample are independant and identically distributed (iid)
  2. Observation in each sample are normally distributed
  3. observations in each sample have the same variance
  4. Observations across each sample are paired.
  5. interpretation
    • H0 : the means of the samples are equal
    • H1 : the means of the saamples are unequal.
In [ ]:
# Example of the Paired Student's t-test
from scipy.stats import ttest_rel
data1 = [0.873, 2.817, 0.121, -0.945, -0.055, -1.436, 0.360, -1.478, -1.637, -1.869]
data2 = [1.142, -0.432, -0.938, -0.729, -0.846, -0.157, 0.500, 1.183, -1.075, -0.169]
stat, p = ttest_rel(data1, data2)
print('stat=%.3f, p=%.3f' % (stat, p))
if p > 0.05:
  print('Probably the same distribution')
else:
  print('Probably different distributions')
stat=-0.334, p=0.746
Probably the same distribution

4- Analysis of Variance Test (ANOVA)¶

Tests whether the means of two or more independent samples are significantly different.

Asumptions¶

  1. Observations in each sample are independant and identically distributed (iid)
  2. Observation in each sample are normally distributed
  3. observations in each sample have the same variance
  4. interpretation
    • H0 : the means of the samples are equal
    • H1 : the means of the saamples are unequal.
In [ ]:
# Example of the Analysis of Variance test 
from scipy.stats import f_oneway
data1 = [0.873, 2.817, 0.121, -0.945, -0.055, -1.436, 0.360, -1.478, -1.637, -1.869]
data2 = [1.142, -0.432, -0.938, -0.729, -0.846, -0.157, 0.500, 1.183, -1.075, -0.169]
data3 = [-0.208, 0.696, 0.928, -1.148, -0.213, 0.229, 0.137, 0.269, -0.870, -1.204]
stat, p = f_oneway(data1, data2, data3)
print('stat=%.3f, p=%.3f' % (stat, p))
if p > 0.05:
  print('Probably the same distribution')
else:
  print('Probably different distributions')
stat=0.096, p=0.908
Probably the same distribution
In [ ]:
# Some other test 
# Post hoc tests
#  - tuckey test
#  - Bonfroni test 
# Two way ANOVA
# MANOVA
# MANCOVA

Statistics Notations¶

$\sum$ = Summation

$n$ = Sample size

$N$ = Population size

$X$ = Individual value

$X_1$ =Particular first value

$X_i$ = For each individual value

$p$ = Proportion of sample data

$P$ = Proportion of polpulation data

$\bar{x}$ = Mean of sample data

$\mu$ = (mu) Mean of population

$\sigma$ = (sigma) Standard Deviation of population

$s$ = Stnadar Deviation of sample population

$s^2$ = Variance of sample data

$\sigma^2$ = Variance of Population

$R$ = Range of data

$\bar{R}$ = Average range of Data

$k$ = Multi-purpose notation

<,> = less than , greater than

$\ge$ = Greater than equal to

$\le$ = Less than equal to

$\alpha$ = type I error rate

$\beta$ = Regression Co-efficient of population

$\theta$ = Gerenal Popultion parameter

Chapter 9 - Markdown cheet sheet¶

1-Headings¶

How to give headings in markdown files

Heading 1¶

Heading 2¶

Heading 3¶

Heading 4¶

2-Block of Words/citation¶

This is a normal text in markdown

This is a block of special text\ and the block is goin on as i have place a fwd slash

3-Line Break¶

This is 40 days long corse of Data science with python also know as python ka chilla with baba ammar

this is a second line

For line braeak we can us double enter or fwd slash

4-Combine two thing¶

Block of words and Heading

Heading 2¶

5-Face of text¶

Bold

italic

bold and italic

also we can use underscore (_) with (*) to do the same

bold and italic

6-Bullet points or Lists¶
  • day1
  • day2
  • day 3
  • day4
  • day5
    • day 5a
    • sublist (anything)
  • day 6
  • day 7

also we also can use * and + to make list

  • day1
  • day 3
  • day4
  • day5
  • day7

or if we want numbering

  1. day1
  2. day2
  3. day3
  4. day4
  5. day5
    1. day 5a

7= Line/Page Break¶

This is one page.



This is page two



8- Link and Hyperlinks¶

<> paste the link in between this to make is a hyperlink https://www.youtube.com/watch?v=qJqAXjz-Rh4

To watch the video of Mark_Down file is here

Link for python ka chilla is here

Making a key

Codanics

Codanics will teach you the whole subject of data science

the whole corse is here

9- Images and figures with link¶

to join this corse please scan the following QR code and join telegram group

QR

Picture

10-Adding code in code block¶

TO print a string print("codanic")

to print a code in line print("Hello")

to print a code in seprate block

this code will show systax according to python

x= 5+6
y=8+3
z= x+y
print(z)
`

this code will show systax according to R

x= 5+6
y=8+3
z= x+y
print(z)
`
11-Adding Table¶
species petal_length Sepal_length
virgenica 18.2 19.2
setosa 20 17.2
setosa 20 17.2
setosa 20 17.2
12-Table of content¶
  • 1-Headings
  • Heading 1
    • Heading 2
      • Heading 3
        • Heading 4
  • 2-Block of Words/citation
  • 3-Line Break
  • 4-Combine two thing
  • 5-Face of text
  • 6-Bullet points or Lists
  • 7= Line/Page Break
  • 8- Link and Hyperlinks
  • 9- Images and figures with link
  • 10-Adding code in code block
  • 14-How to change colour
  • 15-Adding a equation in MD
13- Install Extensions¶

Sample text

bold

italic and bold

Link

Image\ Image

image

here we can write our code

hello with citation

Column A Column B Column C
A1 B1 C1
A2 B2 C2
A3 B3 C3

hello Column A | Column B | Column C ---------|----------|--------- A1 | B1 | C1 A2 | B2 | C2 A3 | B3 | C3

14-How to change colour¶

Example:

THis text is normal\ This text is red owe can even give a hex code to change the color

15-Adding a equation in MD¶

In-line math

$this_{2}^{3}$

or

Math block

$$ \int0^\infty \ frac{x^3}{e^x-1}\,dx= \frac{\pi^4}{15} $$

for mmore information we can watch:[MathJax]

Chapter 10- Machine Learning¶

Data Driven Decesion Making¶

Types of Machine learning¶

  1. Supervised
  2. Un-Supervised (Clustring)
  3. Semi-Supervised
  4. Reinforcement

1- Supervised¶

  • Works under supervision
  • Teacher teaches
  • Prediction
  • Outcome

Input data ==> Training models and learning patterns of data ==> Prediction ==> Output and Reports

Types¶

  • Classification
    • For catagories
  • Regression
    • For numerical data

Supervised learning algorithms¶

  • Logistic Regression
    • Logistic regression is a supervised learning classification algorithm used to predict the probability of a target variable.
  • K- Nearest Neighbours (K-NN)
    • The k-nearest neighbors (KNN) algorithm is a supervised machine learning algorithm that can be used to solve both classification and regression problems. It's easy to implement and understand, but has a major drawback of becoming significantly slows as the size of that data in use grows.
  • Support Vector Machines (SVM)
    • Support Vector Machine” (SVM) is a supervised machine learning algorithm that can be used for both classification or regression challenges.
  • Kernel SVM
    • SVM algorithms use a set of mathematical functions that are defined as the kernel. The function of kernel is to take data as input and transform it into the required form. Different SVM algorithms use different types of kernel functions. These functions can be different types. For example linear, nonlinear, polynomial, radial basis function (RBF), and sigmoid.
  • Naive Bayes
    • Naïve Bayes Classifier is effective Classification algorithms which helps in building the fast machine learning models that can make quick predictions. It is a probabilistic classifier, which means it predicts on the basis of the probability of an object.
  • Decision Tree Clasification
    • Decision Trees are a type of Supervised Machine Learning (that is you explain what the input is and what the corresponding output is in the training data) where the data is continuously split according to a certain parameter. The tree can be explained by two entities, namely decision nodes and leaves.
  • Random Forest Classification
    • Random Forest is a classifier that contains a number of decision trees on various subsets of the given dataset and takes the average to improve the predictive accuracy of that dataset.

2- Un-Supervised (Clustring)¶

  • No supervision
  • No Teacher
  • Self learning
  • No labelling of data
  • Find patterns by itself

Training models and learning patterns of data by making different clusters ==> Prediction ==> Output and Reports

Supervised learning algorithms¶

  • K means clustering
    • Kmeans algorithm is an iterative algorithm that tries to partition the dataset into Kpre-defined distinct non-overlapping subgroups (clusters) where each data point belongs to only one group.The less variation we have within clusters, the more homogeneous (similar) the data points are within the same cluster.
  • Hierarchical clustering
    • Hierarchical clustering, also known as hierarchical cluster analysis, is an algorithm that groups similar objects into groups called clusters. The endpoint is a set of clusters, where each cluster is distinct from each other cluster, and the objects within each cluster are broadly similar to each other.
  • Probabilistic clustering
    • In probabilistic clustering the assignment of points to clusters is “soft”, in the sense that the membership of a data point x in a cluster Ck is given as a probability, denoted by pk(x). These are subjective probabilities, indicating strength of belief in the event in question.

3- Semi-Supervised¶

  • Mixture of 1 and 2
  • Some data is labelled but most is not

Input data ==> Training models and learning patterns of data ==> Prediction ==> Output and Reports

4- Reinforcement¶

  • Hit and trial method
  • Learn from mistakes
  • Reward and punishment rule
  • Prediction based on reward and punishment
  • Depends on feedback

Input data ==> Training models and learning patterns of data [ WELL-DONE] ==> Prediction ==> Output and Reports

Reinforcement learning algorithms¶

  1. Model-free Reinforcement learning
    • In reinforcement learning (RL), a model-free algorithm (as opposed to a model-based one) is an algorithm which does not use the transition probability distribution (and the reward function) associated with the Markov decision process (MDP), which, in RL, represents the problem to be solved.
      • Policy Optimization
      • Q-Learning
  1. Model-Based Reinforcement learning
    • Model-based Reinforcement Learning refers to learning optimal behavior indirectly by learning a model of the environment by taking actions and observing the outcomes that include the next state and the immediate reward.
      • Learn the model
      • Given the model

Supervised Machine Learning¶

Regression¶

Simple linear regression¶

Step 1- Import dataset¶

In [ ]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Importing the dataset
df = pd.read_csv('salary_data.csv')
df.head()
Out[ ]:
YearsExperience Salary
0 1.1 39343
1 1.3 46205
2 1.5 37731
3 2.0 43525
4 2.2 39891

Step 2- Splitting the dataset into the Training data and Testing data¶

In [ ]:
X = df[["YearsExperience"]]
y = df["Salary"]
X.head()
Out[ ]:
YearsExperience
0 1.1
1 1.3
2 1.5
3 2.0
4 2.2
In [ ]:
y.head()
Out[ ]:
0    39343
1    46205
2    37731
3    43525
4    39891
Name: Salary, dtype: int64
In [ ]:
#import library 
from sklearn.model_selection import train_test_split 
# slip the data 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

Step 3- Fitting Linear Regression Model¶

In [ ]:
# use least square method to fit the line 
from sklearn.linear_model import LinearRegression
linreg = LinearRegression()
model = LinearRegression().fit(X_train, y_train)
model
Out[ ]:
LinearRegression()

Step 4- Plotting¶

In [ ]:
plt.scatter (X_train, y_train)
plt.plot(X_train, model.predict(X_train))
#plt.plot() ==> plot kro ,  plt.plot(X_train)==> X_train me , plt.plot(X_train)
# model.predict(X_train)) model ko pridict kro X_train se 
Out[ ]:
[<matplotlib.lines.Line2D at 0x1dd06e01ab0>]
In [ ]:
# Adding Colour 
plt.scatter (X_train, y_train)
plt.plot(X_train, model.predict(X_train) ,color = "Green")
Out[ ]:
[<matplotlib.lines.Line2D at 0x1dd06e84310>]
In [ ]:
#Adding labels
plt.scatter (X_train, y_train)
plt.plot(X_train, model.predict(X_train) ,color = "Green")
plt.xlabel("Tajurba")
plt.ylabel("Tankhwah")
plt.title("Train Plot")
plt.show
Out[ ]:
<function matplotlib.pyplot.show(close=None, block=None)>
In [ ]:
# Pridicting testing data 
plt.scatter (X_test, y_test)
plt.plot(X_train, model.predict(X_train) ,color = "red")
plt.xlabel("Tajurba")
plt.ylabel("Tankhwah")
plt.title("Test Plot")
plt.show
Out[ ]:
<function matplotlib.pyplot.show(close=None, block=None)>

Step 5 - Testing or Evaluating your model¶

In [ ]:
# Model Fitness
# Jo library import ki hui hai usi k andr hi hum ne test score/regression score nikalna hai 
print("Score for testing data =", model.score(X_test, y_test))
print("Score for training data =", model.score(X_train, y_train))
Score for testing data = 0.988169515729126
Score for training data = 0.9411949620562126
In [ ]:
model.score(X_test, y_test)
Out[ ]:
0.988169515729126
In [ ]:
model.score(X_train, y_train)
Out[ ]:
0.9411949620562126

Step6 - Prediction of unknown values¶

In [ ]:
model.predict([[5]])
C:\Users\Epazz\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\base.py:450: UserWarning: X does not have valid feature names, but LinearRegression was fitted with feature names
  warnings.warn(
Out[ ]:
array([73342.97478427])
In [ ]:
model.predict(X_test)
Out[ ]:
array([ 40748.96184072, 122699.62295594,  64961.65717022,  63099.14214487,
       115249.56285456, 107799.50275317])
In [ ]:
model.predict([[5],[6],[10]])
C:\Users\Epazz\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\base.py:450: UserWarning: X does not have valid feature names, but LinearRegression was fitted with feature names
  warnings.warn(
Out[ ]:
array([ 73342.97478427,  82655.549911  , 119905.85041792])
In [ ]:
a = ([[10], [20], [30], [5]])
In [ ]:
model.predict(a)
C:\Users\Epazz\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\base.py:450: UserWarning: X does not have valid feature names, but LinearRegression was fitted with feature names
  warnings.warn(
Out[ ]:
array([119905.85041792, 213031.60168521, 306157.3529525 ,  73342.97478427])
In [ ]:
y_pred = model.predict(X_test)
y_pred
Out[ ]:
array([ 40748.96184072, 122699.62295594,  64961.65717022,  63099.14214487,
       115249.56285456, 107799.50275317])
In [ ]:
x_new = np.linspace(0, 30, 100)
y_new = model.predict(x_new[:, np.newaxis])
y_new
C:\Users\Epazz\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\base.py:450: UserWarning: X does not have valid feature names, but LinearRegression was fitted with feature names
  warnings.warn(
Out[ ]:
array([ 26780.09915063,  29602.09161327,  32424.08407592,  35246.07653856,
        38068.06900121,  40890.06146385,  43712.0539265 ,  46534.04638914,
        49356.03885179,  52178.03131444,  55000.02377708,  57822.01623973,
        60644.00870237,  63466.00116502,  66287.99362766,  69109.98609031,
        71931.97855295,  74753.9710156 ,  77575.96347824,  80397.95594089,
        83219.94840353,  86041.94086618,  88863.93332882,  91685.92579147,
        94507.91825411,  97329.91071676, 100151.9031794 , 102973.89564205,
       105795.88810469, 108617.88056734, 111439.87302998, 114261.86549263,
       117083.85795527, 119905.85041792, 122727.84288057, 125549.83534321,
       128371.82780586, 131193.8202685 , 134015.81273115, 136837.80519379,
       139659.79765644, 142481.79011908, 145303.78258173, 148125.77504437,
       150947.76750702, 153769.75996966, 156591.75243231, 159413.74489495,
       162235.7373576 , 165057.72982024, 167879.72228289, 170701.71474553,
       173523.70720818, 176345.69967082, 179167.69213347, 181989.68459611,
       184811.67705876, 187633.66952141, 190455.66198405, 193277.6544467 ,
       196099.64690934, 198921.63937199, 201743.63183463, 204565.62429728,
       207387.61675992, 210209.60922257, 213031.60168521, 215853.59414786,
       218675.5866105 , 221497.57907315, 224319.57153579, 227141.56399844,
       229963.55646108, 232785.54892373, 235607.54138637, 238429.53384902,
       241251.52631166, 244073.51877431, 246895.51123695, 249717.5036996 ,
       252539.49616224, 255361.48862489, 258183.48108754, 261005.47355018,
       263827.46601283, 266649.45847547, 269471.45093812, 272293.44340076,
       275115.43586341, 277937.42832605, 280759.4207887 , 283581.41325134,
       286403.40571399, 289225.39817663, 292047.39063928, 294869.38310192,
       297691.37556457, 300513.36802721, 303335.36048986, 306157.3529525 ])
In [ ]:
from sklearn.metrics import r2_score
r2_score(y_test, y_pred)
Out[ ]:
0.988169515729126

Model Evaluation¶

  • Mean absolute error
  • Mean squared error
  • Root mean squared error
In [ ]:
from sklearn import metrics
print (metrics.mean_absolute_error(y_test, y_pred))
print(metrics.mean_squared_error(y_test, y_pred))
print(np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
# RAE
# RSE
2446.1723690465055
12823412.298126549
3580.979237321343

Health Insurance Cost by Linear Regression¶

In this project we want to know the cost of health insurance of a person and will see how it may effect with the other factors

Step 1- Importing Libraries¶

In [ ]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

Step 2- Importing the data set¶

In [ ]:
ins= pd.read_csv("insurance.csv")

Step 3- Quick look at data¶

In [ ]:
#looking at first 5 rows of data set 
ins.head(5)
Out[ ]:
age sex bmi children smoker region charges
0 19 female 27.900 0 yes southwest 16884.92400
1 18 male 33.770 1 no southeast 1725.55230
2 28 male 33.000 3 no southeast 4449.46200
3 33 male 22.705 0 no northwest 21984.47061
4 32 male 28.880 0 no northwest 3866.85520
In [ ]:
#listing the column names 
ins.columns
Out[ ]:
Index(['age', 'sex', 'bmi', 'children', 'smoker', 'region', 'charges'], dtype='object')
In [ ]:
#Checking total no. of rows and coloumns
ins.shape
Out[ ]:
(1338, 7)
In [ ]:
#overall info of data set , which type of data we have 
ins.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB

Here we have 3 catagorical features and 4 Numerical features

  • Catagorical features
    • sex
    • smoker
    • region
  • Numerical features
    • age
    • bmi
    • children
    • charges

Step 4- Data Visualization and Wrangling¶

Checking if there is any null value¶
In [ ]:
ins.isnull().sum()
Out[ ]:
age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64
4.1 Numerical Features¶
  • 4.1.1 - Charges
In [ ]:
plt.figure(figsize=(10,6)) # Setting the size of required figure 
sns.distplot(ins['charges'])
C:\Users\Epazz\AppData\Local\Programs\Python\Python310\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).
  warnings.warn(msg, FutureWarning)
Out[ ]:
<AxesSubplot:xlabel='charges', ylabel='Density'>
In [ ]:
plt.figure(figsize=(10,6))
sns.boxplot(ins["charges"])
C:\Users\Epazz\AppData\Local\Programs\Python\Python310\lib\site-packages\seaborn\_decorators.py:36: FutureWarning: Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
  warnings.warn(
Out[ ]:
<AxesSubplot:xlabel='charges'>

Here we see in interquartile range median is slighlty skewed to left also we have many outlyers , we might need to deal with this data

  • 4.1.2 - Age
In [ ]:
plt.figure(figsize=(10,6))
sns.distplot(ins['age'])
C:\Users\Epazz\AppData\Local\Programs\Python\Python310\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).
  warnings.warn(msg, FutureWarning)
Out[ ]:
<AxesSubplot:xlabel='age', ylabel='Density'>
In [ ]:
plt.figure(figsize=(10,6))
sns.boxplot(ins["age"])
C:\Users\Epazz\AppData\Local\Programs\Python\Python310\lib\site-packages\seaborn\_decorators.py:36: FutureWarning: Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
  warnings.warn(
Out[ ]:
<AxesSubplot:xlabel='age'>

Here we see the median is almost at midd in interquartile range

  • 4.1.3- bmi
In [ ]:
plt.figure(figsize=(10,6))
sns.distplot(ins['bmi'])
C:\Users\Epazz\AppData\Local\Programs\Python\Python310\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).
  warnings.warn(msg, FutureWarning)
Out[ ]:
<AxesSubplot:xlabel='bmi', ylabel='Density'>
In [ ]:
plt.figure(figsize=(10,6))
sns.boxplot(ins["bmi"])
C:\Users\Epazz\AppData\Local\Programs\Python\Python310\lib\site-packages\seaborn\_decorators.py:36: FutureWarning: Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
  warnings.warn(
Out[ ]:
<AxesSubplot:xlabel='bmi'>

Here we see the median is almost at mid in interquartile range with some outlyers

4.2- Checking Co-relation between the numerical data¶
In [ ]:
fig = plt.figure(figsize=(10,6))
sns.heatmap(ins.corr())
Out[ ]:
<AxesSubplot:>
In [ ]:
# with values
fig = plt.figure(figsize=(10,6))
sns.heatmap(ins.corr(), annot=True)
Out[ ]:
<AxesSubplot:>

Here we see that charges are in relation with

  • bmi
  • age
In [ ]:
sns.lineplot(x= "age", y= "charges" , data = ins )
Out[ ]:
<AxesSubplot:xlabel='age', ylabel='charges'>
In [ ]:
sns.lineplot(x= "bmi", y= "charges" , data = ins )
Out[ ]:
<AxesSubplot:xlabel='bmi', ylabel='charges'>
In [ ]:
sns.scatterplot(x='bmi', y='charges', data=ins )
Out[ ]:
<AxesSubplot:xlabel='bmi', ylabel='charges'>
In [ ]:
sns.jointplot(x='bmi', y='charges', data=ins ,  kind='hex')
Out[ ]:
<seaborn.axisgrid.JointGrid at 0x1dd0a4ec5e0>

4.3 Catagorical features¶

now we will check how the charges variates with our catagorical features

Transforming our catagorical features¶

Now we will transform our catagorical values and after that will check the correlation

  • Transforming [Sex]
In [ ]:
ins["sex"].unique()
Out[ ]:
array(['female', 'male'], dtype=object)
In [ ]:
ins["sex"]= ins["sex"].replace("male", 1)
ins["sex"]= ins["sex"].replace("female", 0)
  • Transforming [Smoker]
In [ ]:
ins["smoker"].unique()
Out[ ]:
array(['yes', 'no'], dtype=object)
In [ ]:
ins["smoker"]= ins["smoker"].replace("yes", 1)
ins["smoker"]= ins["smoker"].replace("no", 0)
In [ ]:
ins["region"].unique()
Out[ ]:
array(['southwest', 'southeast', 'northwest', 'northeast'], dtype=object)
  • Transforming [region]
In [ ]:
# creating dummies for region coloumn 
r_dummies = pd.get_dummies(ins["region"])
In [ ]:
# Drop column 
ins = ins.drop('region',axis = 1)
# Join 
ins = ins.join(r_dummies)
ins.head() 
Out[ ]:
age sex bmi children smoker charges northeast northwest southeast southwest
0 19 0 27.900 0 1 16884.92400 0 0 0 1
1 18 1 33.770 1 0 1725.55230 0 0 1 0
2 28 1 33.000 3 0 4449.46200 0 0 1 0
3 33 1 22.705 0 0 21984.47061 0 1 0 0
4 32 1 28.880 0 0 3866.85520 0 1 0 0
In [ ]:
# now checking the correlation 
fig = plt.figure(figsize=(12,6))
sns.heatmap(ins.corr(), annot=True)
Out[ ]:
<AxesSubplot:>

Here we see charges are in correaltion with

  • age
  • bmi
  • smoker

Step 5- Creating a Model¶

In [ ]:
X= ins[["age","bmi", "smoker"]]
y= ins["charges"]
In [ ]:
# importing Linear regression algorithm from sklearn library 
from sklearn.linear_model import LinearRegression
# fitting our X and y values in this algorithm 
model = LinearRegression().fit(X,y)
model
Out[ ]:
LinearRegression()

Step 6- Getting Prediction from Model¶

In [ ]:
# getting prediction in following way 
#model.predict([[age  ,bmi , smoker]])
model.predict([[19 ,27.9, 1]])
C:\Users\Epazz\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\base.py:450: UserWarning: X does not have valid feature names, but LinearRegression was fitted with feature names
  warnings.warn(
Out[ ]:
array([26079.21861526])
In [ ]:
#Cheching model co-efficient and intercept 
print(model.coef_)
print(model.intercept_)
[  259.54749155   322.61513282 23823.68449531]
-11676.830425187778
In [ ]:
coeff_ins = pd.DataFrame(model.coef_,X.columns,columns=['Co-efficient'])
coeff_ins
Out[ ]:
Co-efficient
age 259.547492
bmi 322.615133
smoker 23823.684495

Step 7- Checking accuracy of the model.¶

TO check the accurary we first have to split the data ,some for training the model and some for testting it

Splitting the dataset into the Training data and Testing data¶
In [ ]:
#import library 
from sklearn.model_selection import train_test_split 
# splitting the data : 80% training data and 20% testing data  
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
In [ ]:
# fitting the training data in linear regression again 
from sklearn.linear_model import LinearRegression
model1 = LinearRegression().fit(X_train, y_train)
model1
Out[ ]:
LinearRegression()
In [ ]:
# getting a prediction again 
model1.predict([[19 ,27.9, 1]])
C:\Users\Epazz\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\base.py:450: UserWarning: X does not have valid feature names, but LinearRegression was fitted with feature names
  warnings.warn(
Out[ ]:
array([25916.43448944])
In [ ]:
# getting prediction for whole values we splitted for test 
y_pred = model1.predict(X_test)
In [ ]:
#importing library for test  
from sklearn.metrics import r2_score
# now testing between already know y_test values and y_pred values 
score = r2_score(y_test, y_pred)
score
Out[ ]:
0.7945500805653087

So according to this r2_score test our model is 79% accurate

Regression Evaluation Metrics¶
In [ ]:
from sklearn import metrics
print('MAE:', metrics.mean_absolute_error(y_test, y_pred))
print('MSE:', metrics.mean_squared_error(y_test, y_pred))
print('RMSE', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
MAE: 3960.8666198087158
MSE: 32693237.938726168
RMSE 5717.80009607945

Print insurance cost¶

In [ ]:
age = input('What is your age ? \n')
bmi = input('What is your bmi  ? \n')
smoker= input('press 1 if you are  a smoker and 0 if not \n')
try:
    print('We predict {:.0f}$ will be your insurance cost .'.format(
        model.predict([[float(age), float(bmi),int(smoker)]])[0]))
except ValueError:
    print('Please only input either:\n- whole number e.g. 1, 4, 7\n- decimal/float number e.g. 3.8')
We predict 74168$ will be your insurance cost .
C:\Users\Epazz\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\base.py:450: UserWarning: X does not have valid feature names, but LinearRegression was fitted with feature names
  warnings.warn(

Multiple Linear Regression¶

In [ ]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn import linear_model
from sklearn.linear_model import LinearRegression

# Importing the dataset
df = pd.read_csv('ml_data_salary.csv')
df.head()
Out[ ]:
age distance YearsExperience Salary
0 31.1 77.75 1.1 39343
1 31.3 78.25 1.3 46205
2 31.5 78.75 1.5 37731
3 32.0 80.00 2.0 43525
4 32.2 80.50 2.2 39891
In [ ]:
# Set independent and dependent variables
X = df[['distance', 'YearsExperience']]
y = df['Salary']
In [ ]:
# Initialize model from sklearn and fit it into our data
regr = linear_model.LinearRegression()
model = regr.fit(X, y)
model
Out[ ]:
LinearRegression()
In [ ]:
print('Intercept:', model.intercept_)
print('Coefficients:', model.coef_)
Intercept: -218603.37708034192
Coefficients: [3258.60769705 1303.44307882]
In [ ]:
# Values to predict
distance = input('How much distance you have to cover for job? \n')
YearsExperience = input('How many years of experience do you have ? \n')

try:
    print('We predict {:.0f}$ will be you salary if you have to cover {}m and have workin experience of {} years.'.format(
        model.predict([[float(distance), float(YearsExperience)]])[0],
        distance, 
        YearsExperience))
except ValueError:
    print('Please only input either:\n- whole number e.g. 1, 4, 7\n- decimal/float number e.g. 3.8')
We predict 37360$ will be you salary if you have to cover 77.75m and have workin experience of 2 years.
C:\Users\Epazz\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\base.py:450: UserWarning: X does not have valid feature names, but LinearRegression was fitted with feature names
  warnings.warn(
In [ ]:
X.shape
Out[ ]:
(30, 2)

Plotting¶

In [ ]:
# Prepare data
X = df[['distance', 'YearsExperience']].values.reshape(-1,2)
Y = df['Salary']
In [ ]:
print(df["distance"].max())
print(df["distance"].min())
101.25
77.75
In [ ]:
print(df["YearsExperience"].max())
print(df["YearsExperience"].min())
10.5
1.1
In [ ]:
# Create range for each dimension
x = X[:, 0]
y = X[:, 1]
z = Y
In [ ]:
xx_pred = np.linspace(77, 102, 30)  # range of distance values
yy_pred = np.linspace(1, 11, 30)  # range of YearsExperience values
xx_pred, yy_pred = np.meshgrid(xx_pred, yy_pred)
model_viz = np.array([xx_pred.flatten(), yy_pred.flatten()]).T
In [ ]:
predicted = model.predict(model_viz)
C:\Users\Epazz\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\base.py:450: UserWarning: X does not have valid feature names, but LinearRegression was fitted with feature names
  warnings.warn(
In [ ]:
# Evaluate model by using it's R^2 score 
r2 = model.score(X, Y)
r2
#r2 score is almost near to one so we are good to go 
C:\Users\Epazz\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\base.py:450: UserWarning: X does not have valid feature names, but LinearRegression was fitted with feature names
  warnings.warn(
Out[ ]:
0.9569566641435084
In [ ]:
# Plot model visualization
plt.style.use('fivethirtyeight')
fig = plt.figure(figsize=(12, 4))

ax1 = fig.add_subplot(131, projection='3d')
ax2 = fig.add_subplot(132, projection='3d')
ax3 = fig.add_subplot(133, projection='3d')

axes = [ax1, ax2, ax3]

for ax in axes:
    ax.plot(x, y, z, color='k', zorder=15, linestyle='none', marker='o', alpha=0.5)
    ax.scatter(xx_pred.flatten(), yy_pred.flatten(), predicted, facecolor=(0,0,0,0), s=20, edgecolor='#70b3f0', )
    ax.set_xlabel('distance', fontsize=12)
    ax.set_ylabel('YearsExperience', fontsize=12)
    ax.set_zlabel('Salary', fontsize=12)
    ax.locator_params(nbins=4, axis='x')
    ax.locator_params(nbins=5, axis='x')

    ax1.view_init(elev=25, azim=-60)
ax2.view_init(elev=15, azim=15)
ax3.view_init(elev=25, azim=60)

fig.suptitle('Multi-Linear Regression Model Visualization ($R^2 = %.2f$)' % r2, fontsize=16, color='k')

fig.tight_layout()
In [ ]:
#FOR accuracy 
#import library 
from sklearn.model_selection import train_test_split 
X = df[["distance", "YearsExperience","age"]]
y = df["Salary"]
# slip the data 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
In [ ]:
from sklearn.linear_model import LinearRegression
linreg = LinearRegression()
model = LinearRegression().fit(X_train, y_train)
model
Out[ ]:
LinearRegression()
In [ ]:
plt.scatter (X_train[["distance"]], y_train)
plt.scatter (X_train[["YearsExperience"]], y_train)
plt.scatter (X_train[["age"]], y_train)
plt.plot(X_train, model.predict(X_train), color = "Grey" )
plt.xlabel("YearsExperience| age | distance")
plt.ylabel("Salary")
plt.title("Train Plot ")
plt.show
Out[ ]:
<function matplotlib.pyplot.show(close=None, block=None)>
In [ ]:
#R2 test 
print("Score for testing data =", model.score(X_test, y_test))
print("Score for training data =", model.score(X_train, y_train))
Score for testing data = 0.9880925772756097
Score for training data = 0.9411691490005899
In [ ]:
model.predict([[33 ,77.75, 1.1]])
C:\Users\Epazz\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\base.py:450: UserWarning: X does not have valid feature names, but LinearRegression was fitted with feature names
  warnings.warn(
Out[ ]:
array([-5.09866629e+17])
In [ ]:
y_pred =model.predict(X_test)
y_pred
Out[ ]:
array([ 40784., 122688.,  64992.,  63120., 115264., 107840.])

Model Evaluation¶

  • Mean absolute error
  • Mean squared error
  • Root mean squared error
In [ ]:
from sklearn import metrics
print (metrics.mean_absolute_error(y_test, y_pred))
2442.5
In [ ]:
print(metrics.mean_squared_error(y_test, y_pred))
12906808.166666666
In [ ]:
print(np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
3592.604649368848

Random Forest Regression¶

Weather data for Seattle, WA from 2016¶

retrieved from NOAA Climate Data Online tool

In [ ]:
# Importing libraries
# Pandas is used for data manipulation
import pandas as pd
# Read in data and display first 5 rows
df = pd.read_csv('temps.csv')
df.head(5)
Out[ ]:
year month day week temp_2 temp_1 average actual forecast_noaa forecast_acc forecast_under friend
0 2016 1 1 Fri 45 45 45.6 45 43 50 44 29
1 2016 1 2 Sat 44 45 45.7 44 41 50 44 61
2 2016 1 3 Sun 45 44 45.8 41 43 46 47 56
3 2016 1 4 Mon 44 41 45.9 40 44 48 46 53
4 2016 1 5 Tues 41 40 46.0 44 46 46 46 41
  • The information is in the tidy data format with each row forming one observation, with the variable values in the columns.

Following are explanations of the columns:

  • year: 2016 for all data points
  • month: number for month of the year
  • day: number for day of the year
  • week: day of the week as a character string
  • temp_2: max temperature 2 days prior
  • temp_1: max temperature 1 day prior
  • average: historical average max temperature
  • actual: max temperature measurement
  • friend: your friend’s prediction, a random number between 20 below the average and 20 above the ave
In [ ]:
#Checking the shape of our data that how many row and coloumns we hai 
print('The shape of our features is:', df.shape)
#so we have 348 rows and 12 coloumns in our data
The shape of our features is: (348, 12)
In [ ]:
# check if there is Nan values in here our data
df.isnull().sum()
#there is no Nan value in our data 
Out[ ]:
year              0
month             0
day               0
week              0
temp_2            0
temp_1            0
average           0
actual            0
forecast_noaa     0
forecast_acc      0
forecast_under    0
friend            0
dtype: int64
In [ ]:
# getting some information of our data 
df.info()
# here we have one catagorical value column . 
# As we are working in regression so we have to convert it into integer type by making the dummy data
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 348 entries, 0 to 347
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   year            348 non-null    int64  
 1   month           348 non-null    int64  
 2   day             348 non-null    int64  
 3   week            348 non-null    object 
 4   temp_2          348 non-null    int64  
 5   temp_1          348 non-null    int64  
 6   average         348 non-null    float64
 7   actual          348 non-null    int64  
 8   forecast_noaa   348 non-null    int64  
 9   forecast_acc    348 non-null    int64  
 10  forecast_under  348 non-null    int64  
 11  friend          348 non-null    int64  
dtypes: float64(1), int64(10), object(1)
memory usage: 32.8+ KB
In [ ]:
# Converting the catagorical values data using pandas get_dummies
df = pd.get_dummies(df)
# Display the first 5 rows of the last 12 columns
df.iloc[:,5:].head(5)
Out[ ]:
average actual forecast_noaa forecast_acc forecast_under friend week_Fri week_Mon week_Sat week_Sun week_Thurs week_Tues week_Wed
0 45.6 45 43 50 44 29 1 0 0 0 0 0 0
1 45.7 44 41 50 44 61 0 0 1 0 0 0 0
2 45.8 41 43 46 47 56 0 0 0 1 0 0 0
3 45.9 40 44 48 46 53 0 1 0 0 0 0 0
4 46.0 44 46 46 46 41 0 0 0 0 0 1 0
In [ ]:
# again checking the shape of our data 
df.shape
Out[ ]:
(348, 18)
In [ ]:
# Use numpy to convert to arrays
import numpy as np
# Labels are the values we want to predict
labels = np.array(df['actual'])

# Remove the labels from the features
# axis 1 refers to the columns
df= df.drop('actual',axis = 1)
df= df.drop("forecast_noaa",axis = 1)
#'forecast_under','friend', 'week_Fri''week_Mon','week_Sat','week_Sun','week_Thurs','week_Tues','week_Wed')
# Convert to numpy array
features = np.array(df)
In [ ]:
# Saving feature names for later use
df_list = list(df.columns)
df_list
Out[ ]:
['year',
 'month',
 'day',
 'temp_2',
 'temp_1',
 'average',
 'forecast_acc',
 'forecast_under',
 'friend',
 'week_Fri',
 'week_Mon',
 'week_Sat',
 'week_Sun',
 'week_Thurs',
 'week_Tues',
 'week_Wed']
In [ ]:
# Using Skicit-learn to split data into training and testing sets
from sklearn.model_selection import train_test_split
# Split the data into training and testing sets
train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size = 0.25, random_state = 42)
In [ ]:
print('Training Features Shape:', train_features.shape)
print('Training Labels Shape:', train_labels.shape)
print('Testing Features Shape:', test_features.shape)
print('Testing Labels Shape:', test_labels.shape)
Training Features Shape: (261, 16)
Training Labels Shape: (261,)
Testing Features Shape: (87, 16)
Testing Labels Shape: (87,)

Preparing our Model¶

In [ ]:
# Import the model we are using
from sklearn.ensemble import RandomForestRegressor
# Instantiate model with 1000 decision trees
rf = RandomForestRegressor(n_estimators = 1000, random_state = 42)
# Train the model on training data
rf.fit(train_features, train_labels)
Out[ ]:
RandomForestRegressor(n_estimators=1000, random_state=42)
In [ ]:
# Use the forest's predict method on the test data
predictions = rf.predict(test_features)
In [ ]:
#Checking score of testing and training data 
print("Score for testing data =", rf.score(test_features, test_labels))
print("Score for training data =", rf.score(train_features, train_labels))
Score for testing data = 0.8149088174655048
Score for training data = 0.9746126233418512
In [ ]:
# Checking the mean absolute error 
from sklearn import metrics
print (metrics.mean_absolute_error(test_labels, predictions))

# Long way to do which we have done with sklearn library
# # Calculate the absolute errors
# errors = abs(predictions - test_labels)
# # Print out the mean absolute error (mae)
# print('Mean Absolute Error:', round(np.mean(errors), 2), 'degrees.')
3.859586206896551

Visualizing a Single Decision Tree¶

In [ ]:
# Import tools needed for visualization
from sklearn.tree import export_graphviz
import pydot

# Pull out one tree from the forest
tree = rf.estimators_[5]
tree
Out[ ]:
DecisionTreeRegressor(max_features='auto', random_state=1201263687)
In [ ]:
# Export the image to a dot file
export_graphviz(tree, out_file = 'tree.dot', feature_names = df_list, rounded = True, precision = 1)
In [ ]:
# Use dot file to create a graph
(graph, ) = pydot.graph_from_dot_file('tree.dot')
In [ ]:
# Write graph to a png file
graph.write_png('tree.png')

As we see the graph is so big even to visualize so lets make a new model with low no. of trees

In [ ]:
# Limit depth of tree to 3 levels and 10 trees 
rf_small = RandomForestRegressor(n_estimators=10, max_depth = 3)
rf_small.fit(train_features, train_labels)
Out[ ]:
RandomForestRegressor(max_depth=3, n_estimators=10)
In [ ]:
# Extract the small tree
tree_small = rf_small.estimators_[5]
In [ ]:
# Save the tree as a png image
export_graphviz(tree_small, out_file = 'small_tree.dot', feature_names = df_list, rounded = True, precision = 1)
(graph, ) = pydot.graph_from_dot_file('small_tree.dot')
graph.write_png('small_tree.png')

Polynomial Regression¶

In [ ]:
# for checking a polynomial regression lets firstle check what is bad fit 
# bad fit 
import numpy as np
import matplotlib.pyplot as plt 

x = [89,43,36,36,95,10,66,34,38,20,26,29,48,64,6,5,36,66,72,40]
y = [21,46,3,35,67,95,53,72,58,10,26,34,90,33,38,20,56,2,47,15]

mymodel= np.poly1d(np.polyfit(x ,y ,3))
myline= np.linspace(2 , 95, 100)

plt.scatter(x,y)
plt.plot(myline, mymodel(myline))
plt.show
Out[ ]:
<function matplotlib.pyplot.show(close=None, block=None)>
In [ ]:
# R Square test for bad fit
from sklearn.metrics import r2_score 

x = [89,43,36,36,95,10,66,34,38,20,26,29,48,64,6,5,36,66,72,40]
y = [21,46,3,35,67,95,53,72,58,10,26,34,90,33,38,20,56,2,47,15]
model = np.poly1d(np.polyfit(x,y,3))

print(r2_score(y, model(x)))
0.009952707566680652
In [ ]:
# R Square test for bad fit
from sklearn.metrics import r2_score 

x = [89,43,36,36,95,10,66,34,38,20,26,29,48,64,6,5,36,66,72,40]
y = [21,46,3,35,67,95,53,72,58,10,26,34,90,33,38,20,56,2,47,15]
model = np.poly1d(np.polyfit(x,y,3))

print(r2_score(y, model(x)))
0.009952707566680652
In [ ]:
# Step2 : Draw a line 
import numpy as np
import matplotlib.pyplot as plt 
x= [1,2,3,5,6,7,8,9,10,12,13,14,15,16,18,19,21,22]
y= [100,80,80,60,60,55,60,65,70,70,75,76,78,79,90,99,99,100]

mymodel= np.poly1d(np.polyfit(x ,y ,3))
myline= np.linspace(1 , 22, 200)

plt.scatter(x,y)
plt.plot(myline, mymodel(myline))
plt.show
Out[ ]:
<function matplotlib.pyplot.show(close=None, block=None)>
In [ ]:
# Step 3 : R-Squared 
import numpy as np
import matplotlib.pyplot as plt 
x= [1,2,3,5,6,7,8,9,10,12,13,14,15,16,18,19,21,22]
y= [100,80,80,60,60,55,60,65,70,70,75,76,78,79,90,99,99,100]

model = np.poly1d(np.polyfit(x,y,3))

print(r2_score(y, model(x)))
0.9338713637130449
In [ ]:
# Prediction 

import numpy as np
import matplotlib.pyplot as plt 
x= [1,2,3,5,6,7,8,9,10,12,13,14,15,16,18,19,21,22]
y= [100,80,80,60,60,55,60,65,70,70,75,76,78,79,90,99,99,100]

model = np.poly1d(np.polyfit(x,y,3))

speed = model(10)
speed
Out[ ]:
62.60788989976389

Hands on Example¶

In [ ]:
# Importing Libraries 
import numpy as np
import matplotlib.pyplot as plt 
import pandas as pd 

# IMporting the dataset 
dataset= pd.read_csv("https://s3.us-west-2.amazonaws.com/public.gamelab.fun/dataset/position_salaries.csv")
dataset.head()
Out[ ]:
Position Level Salary
0 Business Analyst 1 45000
1 Junior Consultant 2 50000
2 Senior Consultant 3 60000
3 Manager 4 80000
4 Country Manager 5 110000
In [ ]:
X = dataset.iloc[:, 1:2].values
y = dataset.iloc[:, 2].values
In [ ]:
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
In [ ]:
# Fitting Linear Regression to the dataset
from sklearn.linear_model import LinearRegression
lin_reg = LinearRegression()
lin_reg.fit(X,y)

# Visualizing the Linear Regression results
def viz_linear():
    plt.scatter(X, y, color="red")
    plt.plot(X, lin_reg.predict(X), color="blue")
    plt.title("Truth or Bluff (Linear Regression)")
    plt.xlabel("Position level")
    plt.ylabel("Salary")
    plt.show()
    return
viz_linear()
In [ ]:
# Fitting Polynomial Regression to the dataset
from sklearn.preprocessing import PolynomialFeatures
poly_reg = PolynomialFeatures(degree=4)
X_poly = poly_reg.fit_transform(X)
pol_reg = LinearRegression()
pol_reg.fit(X_poly, y)
Out[ ]:
LinearRegression()
In [ ]:
# Visualizing the Polymonial Regression results
def viz_polymonial():
    plt.scatter(X, y, color="red")
    plt.plot(X, pol_reg.predict(poly_reg.fit_transform(X)), color="blue")
    plt.title("Truth or Bluff (Linear Regression)")
    plt.xlabel("Position level")
    plt.ylabel("Salary")
    plt.show()
    return
viz_polymonial()
In [ ]:
# Predicting a new result with Polymonial Regression
Pred_linear = lin_reg.predict([[11]])
In [ ]:
# Predicting a new result with Polymonial Regression
Pred_ploynomial = pol_reg.predict(poly_reg.fit_transform([[11]]))
In [ ]:
print('Linear Regression Results =', Pred_linear)
print('Polynomial Regression Results =' ,Pred_ploynomial )
print("The differene is ", Pred_ploynomial - Pred_linear)
Linear Regression Results = [694333.33333333]
Polynomial Regression Results = [1780833.33333359]
The differene is  [1086500.00000025]

Classification Type in Superviesed Machine Learning¶

Logistic Regression¶

In [ ]:
#import libraries
import pandas as pd 
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt 
In [ ]:
#import online data 
from sklearn.datasets import load_digits
In [ ]:
digits= load_digits()
In [ ]:
# input variables  or features (X)
digits.data.shape
X = digits.data
# means 1797 pictures size is 64 (8x8)
In [ ]:
# output labels (y)
digits.target.shape
y= digits.target
In [ ]:
plt.gray()
plt.matshow(digits.images[520])
Out[ ]:
<matplotlib.image.AxesImage at 0x1ca6e21fd60>
<Figure size 432x288 with 0 Axes>
In [ ]:
plt.figure(figsize=(20,4))
for index, (image, label) in enumerate(zip(digits.data[0:5], digits.target[0:5])):
    plt.subplot(1,5, index+1)
    #using image(8,8) because we've seen the picsize of 64 i.e 8x8
    plt.imshow(np.reshape(image,(8,8)), cmap= plt.cm.gray)
    plt.title("Training : %i\n" % label, fontsize = 20)
    
In [ ]:
plt.figure(figsize=(20,4))
for index, (image, label) in enumerate(zip(digits.data[0:10], digits.target[0:10])):
    plt.subplot(1,10, index+1)
    #using image(8,8) because we've seen the picsize of 64 i.e 8x8
    plt.imshow(np.reshape(image,(8,8)), cmap= plt.cm.gray)
    plt.title("Training : %i\n" % label, fontsize = 20)
In [ ]:
plt.figure(figsize=(20,4))
for index, (image, label) in enumerate(zip(digits.data[0:10], digits.target[0:10])):
    plt.subplot(1,10, index+1)
    #using image(8,8) because we've seen the picsize of 64 i.e 8x8
    plt.imshow(np.reshape(image,(8,8)), cmap= plt.cm.gray)
    plt.title(label, fontsize = 20)
In [ ]:
#help(plt)
In [ ]:
#split the data 
from sklearn.model_selection import train_test_split
X_train, X_test, y_train,y_test = train_test_split (X, y, test_size=0.25  ,random_state= 0)
In [ ]:
print("Train input data : " , X_train.shape)
print("Test input data : " , X_test.shape)
print("Train output data : " , y_train.shape)
print("Test output data : " , y_test.shape)
Train input data :  (1347, 64)
Test input data :  (450, 64)
Train output data :  (1347,)
Test output data :  (450,)
In [ ]:
# training a model 
from sklearn.linear_model import LogisticRegression
model = LogisticRegression().fit(X_train,y_train)
model
C:\Users\Epazz\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\linear_model\_logistic.py:814: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
Out[ ]:
LogisticRegression()
In [ ]:
print(X_test[0:10].shape)
(10, 64)
In [ ]:
# prediction 
model.predict(X_test[0:10])
Out[ ]:
array([2, 8, 2, 6, 6, 7, 1, 9, 8, 5])
In [ ]:
y_pred = model.predict(X_test)
y_pred.shape
Out[ ]:
(450,)
In [ ]:
# Accuracy score 
score= model.score(X_test, y_test)
score 
# yha pe X_test ko as an input leta hai pr X_test ki prediction ko , y_test se compare kr k k score bta rha 
Out[ ]:
0.9511111111111111
In [ ]:
# Confusion matrix 
from sklearn import metrics
cm= metrics.confusion_matrix(y_test, y_pred)
cm
Out[ ]:
array([[37,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 0, 40,  0,  0,  0,  0,  0,  0,  2,  1],
       [ 0,  1, 40,  3,  0,  0,  0,  0,  0,  0],
       [ 0,  0,  0, 43,  0,  0,  0,  0,  1,  1],
       [ 0,  0,  0,  0, 37,  0,  0,  1,  0,  0],
       [ 0,  0,  0,  0,  0, 46,  0,  0,  0,  2],
       [ 0,  1,  0,  0,  0,  0, 51,  0,  0,  0],
       [ 0,  0,  0,  1,  1,  0,  0, 46,  0,  0],
       [ 0,  3,  1,  0,  0,  0,  0,  0, 43,  1],
       [ 0,  0,  0,  0,  0,  1,  0,  0,  1, 45]], dtype=int64)

Confusion matrix through Seaborn¶

In [ ]:
#plotting a confusion metrix 
plt.figure(figsize=(10,10))
sns.heatmap(cm, annot=True, fmt=".3f", linewidths=0.5 , square= True, cmap= "Spectral")
plt.ylabel("Actual Output")
plt.xlabel("Predicted Output")
title = "Accuracy Score : {0}".format(score)
plt.title(title, size= 15)
Out[ ]:
Text(0.5, 1.0, 'Accuracy Score : 0.9511111111111111')

Confusion matrix through Matplotlib¶

In [ ]:
plt.figure(figsize=(9,9))
plt.imshow(cm, interpolation='nearest', cmap='Pastel1')
plt.title('Confusion matrix', size = 15)
plt.colorbar()
tick_marks = np.arange(10)
plt.xticks(tick_marks, ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9"], rotation=45, size = 10)
plt.yticks(tick_marks, ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9"], size = 10)
plt.tight_layout()
plt.ylabel('Actual label', size = 15)
plt.xlabel('Predicted label', size = 15)
width, height = cm.shape
for x in range (width):
 for y in range (height):
  plt.annotate(str(cm[x][y]), xy=(y, x), 
  horizontalalignment='center',
  verticalalignment='center')

Display Misclassified images with Predicted Labels¶

In [ ]:
# Getting a mis classified lables 
index= 0
misclassifiedIndex = []
for label, predict in zip(y_pred , y_test):
    if predict != label:
        misclassifiedIndex.append(index)
        print(label,predict,index)
    index +=1
    
9 5 56
4 7 94
1 6 118
1 8 124
5 9 130
9 8 169
9 5 181
1 8 196
8 1 213
3 2 235
3 2 251
7 4 315
9 1 325
8 1 331
2 8 335
8 3 378
1 8 398
1 2 415
3 7 425
8 9 429
9 3 430
3 2 440
In [ ]:
#error is this 
# # Getting a mis classified lables 
# index= 0
# misclassifiedIndex = []
# for label, predict in zip(y_pred , y_test):
#     if predict != label:
#         misclassifiedIndex.append(index)
#         print(label,predict,index)
#         index +=1
    
In [ ]:
# plotting missclassified label with know 
plt.figure(figsize= (20,5))
for plotIndex, badIndex in enumerate ( misclassifiedIndex[0:5]):
    plt.subplot(1, 5, plotIndex + 1)
    plt.imshow (np.reshape(X_test[badIndex],(8,8)), cmap= plt.cm.gray)
    plt.title("Predicted :{}, Actual : {}".format (y_pred[badIndex], y_test[badIndex]), fontsize=18)

K- Nearest Neighbour¶

Qareebi rishtedaro wala algorithm

K= No of neighbours

  • K should not be lower ==> Noise
  • K should not be higher ==> Out of sample accuracy decreases

Predict the response value based on the neighbour which is nearest and more in numbers

  • Minkoeski distance

Can also be usde for the numerical data / regression data

K-nearest neighbour accurary measurement¶

Important

  • Jaccard Index
  • F1- score
  • Log loss
  • many more
    • Classification Accuracy
    • Confusion Matrix
    • Area under Curve
    • Mean ABsolute Error
    • Mean squared Error

Pros:

  • Training phase is faster
  • instance based learning algorithm
  • can be used with non-linear data ( it does not mean no parametric data )

Cons :

  • Testing phase is slower
  • Costly for memory and computation
  • Not suitable for large dimensions

How to improve :

  • Data wrangling and scaling
  • Missing value
  • Normalization on some scale for everything (-1-0-1)
  • Reduce dimensions to improve performance
In [ ]:
import pandas as pd
df = pd.read_csv("mldata.csv")
df["gender"]= df["gender"].replace("Male", 1)
df["gender"]= df["gender"].replace("Female", 0)
In [ ]:
X= df[["weight","gender"]]
y= df["likeness"]
In [ ]:
#machine learning algorithm 
from sklearn.neighbors import KNeighborsClassifier

# create and fit our model 
model = KNeighborsClassifier(n_neighbors= 5).fit(X,y)
model
Out[ ]:
KNeighborsClassifier()
In [ ]:
#prediction 
model.predict([[70, 1]])
C:\Users\Epazz\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\base.py:450: UserWarning: X does not have valid feature names, but KNeighborsClassifier was fitted with feature names
  warnings.warn(
Out[ ]:
array(['Biryani'], dtype=object)
In [ ]:
model.predict(X)
Out[ ]:
array(['Biryani', 'Biryani', 'Biryani', 'Biryani', 'Biryani', 'Biryani',
       'Biryani', 'Biryani', 'Biryani', 'Biryani', 'Biryani', 'Biryani',
       'Biryani', 'Biryani', 'Biryani', 'Biryani', 'Biryani', 'Biryani',
       'Biryani', 'Biryani', 'Biryani', 'Biryani', 'Biryani', 'Biryani',
       'Biryani', 'Biryani', 'Biryani', 'Biryani', 'Biryani', 'Biryani',
       'Biryani', 'Biryani', 'Samosa', 'Samosa', 'Biryani', 'Biryani',
       'Samosa', 'Biryani', 'Biryani', 'Samosa', 'Biryani', 'Samosa',
       'Biryani', 'Biryani', 'Biryani', 'Biryani', 'Biryani', 'Biryani',
       'Biryani', 'Biryani', 'Biryani', 'Biryani', 'Biryani', 'Biryani',
       'Biryani', 'Biryani', 'Biryani', 'Biryani', 'Biryani', 'Biryani',
       'Biryani', 'Biryani', 'Biryani', 'Biryani', 'Biryani', 'Biryani',
       'Biryani', 'Biryani', 'Biryani', 'Biryani', 'Biryani', 'Biryani',
       'Biryani', 'Biryani', 'Biryani', 'Biryani', 'Pakora', 'Biryani',
       'Biryani', 'Biryani', 'Biryani', 'Biryani', 'Biryani', 'Biryani',
       'Biryani', 'Samosa', 'Biryani', 'Biryani', 'Biryani', 'Samosa',
       'Biryani', 'Pakora', 'Biryani', 'Biryani', 'Biryani', 'Pakora',
       'Biryani', 'Biryani', 'Biryani', 'Samosa', 'Biryani', 'Biryani',
       'Biryani', 'Biryani', 'Biryani', 'Biryani', 'Biryani', 'Biryani',
       'Biryani', 'Biryani', 'Biryani', 'Biryani', 'Biryani', 'Biryani',
       'Biryani', 'Biryani', 'Biryani', 'Pakora', 'Biryani', 'Samosa',
       'Biryani', 'Pakora', 'Biryani', 'Biryani', 'Biryani', 'Biryani',
       'Biryani', 'Biryani', 'Biryani', 'Biryani', 'Biryani', 'Biryani',
       'Biryani', 'Biryani', 'Biryani', 'Biryani', 'Biryani', 'Pakora',
       'Biryani', 'Biryani', 'Biryani', 'Biryani', 'Samosa', 'Biryani',
       'Biryani', 'Biryani', 'Biryani', 'Biryani', 'Biryani', 'Biryani',
       'Biryani', 'Biryani', 'Biryani', 'Samosa', 'Biryani', 'Biryani',
       'Samosa', 'Biryani', 'Biryani', 'Biryani', 'Pakora', 'Pakora',
       'Biryani', 'Biryani', 'Biryani', 'Biryani', 'Biryani', 'Biryani',
       'Biryani', 'Samosa', 'Pakora', 'Samosa', 'Biryani', 'Biryani',
       'Biryani', 'Biryani', 'Biryani', 'Biryani', 'Samosa', 'Biryani',
       'Biryani', 'Samosa', 'Biryani', 'Pakora', 'Biryani', 'Biryani',
       'Biryani', 'Biryani', 'Biryani', 'Biryani', 'Biryani', 'Biryani',
       'Biryani', 'Samosa', 'Biryani', 'Biryani', 'Biryani', 'Biryani',
       'Biryani', 'Biryani', 'Pakora', 'Biryani', 'Biryani', 'Biryani',
       'Biryani', 'Biryani', 'Samosa', 'Biryani', 'Biryani', 'Biryani',
       'Biryani', 'Biryani', 'Biryani', 'Biryani', 'Samosa', 'Biryani',
       'Biryani', 'Biryani', 'Biryani', 'Biryani', 'Biryani', 'Biryani',
       'Biryani', 'Biryani', 'Biryani', 'Biryani', 'Samosa', 'Biryani',
       'Biryani', 'Biryani', 'Samosa', 'Biryani', 'Biryani', 'Biryani',
       'Biryani', 'Biryani', 'Biryani', 'Biryani', 'Samosa', 'Biryani',
       'Biryani', 'Biryani', 'Biryani', 'Biryani', 'Biryani'],
      dtype=object)
In [ ]:
# split data into test and train (80/20 rule)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2 ,random_state= 0)

# create a model 
model = KNeighborsClassifier(n_neighbors=12)
# fittine a model 
model.fit(X_train, y_train)

predicted_values = model.predict(X_test)
predicted_values
Out[ ]:
array(['Biryani', 'Biryani', 'Biryani', 'Biryani', 'Biryani', 'Biryani',
       'Biryani', 'Biryani', 'Biryani', 'Biryani', 'Biryani', 'Biryani',
       'Biryani', 'Biryani', 'Biryani', 'Biryani', 'Biryani', 'Biryani',
       'Biryani', 'Biryani', 'Biryani', 'Biryani', 'Biryani', 'Biryani',
       'Biryani', 'Biryani', 'Biryani', 'Biryani', 'Biryani', 'Biryani',
       'Biryani', 'Biryani', 'Biryani', 'Biryani', 'Biryani', 'Biryani',
       'Biryani', 'Biryani', 'Biryani', 'Biryani', 'Biryani', 'Biryani',
       'Biryani', 'Biryani', 'Biryani', 'Biryani', 'Biryani', 'Biryani',
       'Biryani'], dtype=object)
In [ ]:
#checkin score 
from sklearn.metrics import accuracy_score
score = accuracy_score(y_test, predicted_values)
score
Out[ ]:
0.6530612244897959
In [ ]:
from sklearn.metrics import f1_score
score = f1_score(y_test, predicted_values,average='weighted')
score
Out[ ]:
0.5159989921894684
In [ ]:
from sklearn.metrics import precision_score
score = precision_score(y_test, predicted_values ,average='macro')
score
C:\Users\Epazz\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\metrics\_classification.py:1318: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
Out[ ]:
0.21768707482993196
In [ ]:
from sklearn.metrics import recall_score
score = recall_score(y_test, predicted_values,average='micro')
score
Out[ ]:
0.6530612244897959

Decision Tree Classification¶

Decision Trees (DTs) are a non-parametric(in which data is not normalized) supervised learning method used for classification and regression.

  • if-then-else decision rules

Pros¶

  1. Easy to use and interpret
  2. Can be visualized
  3. Little data prepration (NOn-parametric)
  4. Anle to handle both numeical and catagorical data
  5. Multi-output problems can be handled
  6. Use boolean logic to model the predictions
  7. Performs well (Some of the assumptions)

Cons¶

  1. Complex trees for beginners
  2. Unstable
  3. No smooth or continuous predictions
  4. Complex steps (XOR, parity or multiplexer problems)
  5. Decision tree learners create biased trees if some classes dominate
In [ ]:
# Descion tree classifier me hum do ya do se ziada parameters ko lene k baad hum class define krte hain
import pandas as pd
df = pd.read_csv("mldata.csv
In [ ]:
df.head()
Out[ ]:
age height weight gender likeness
0 27 170.688 76.0 Male Biryani
1 41 165.000 70.0 Male Biryani
2 29 171.000 80.0 Male Biryani
3 27 173.000 102.0 Male Biryani
4 29 164.000 67.0 Male Biryani
In [ ]:
#catagorical value ko pehle change krna hota 
df["gender"]= df["gender"].replace("Male", 1)
df["gender"]= df["gender"].replace("Female", 0)
df.tail()
Out[ ]:
age height weight gender likeness
240 31 160.0 60.0 1 Pakora
241 26 172.0 70.0 1 Biryani
242 40 178.0 80.0 1 Biryani
243 25 5.7 65.0 1 Biryani
244 33 157.0 56.0 0 Samosa
In [ ]:
# catagorical variable ko define krne k liey decision tree classifier ka use krna parta hai 
In [ ]:
# selection of input and output variables
X= df[["weight","gender"]]
y= df["likeness"]
In [ ]:
X.head()
Out[ ]:
weight gender
0 76.0 1
1 70.0 1
2 80.0 1
3 102.0 1
4 67.0 1
In [ ]:
y.head()
Out[ ]:
0    Biryani
1    Biryani
2    Biryani
3    Biryani
4    Biryani
Name: likeness, dtype: object
In [ ]:
#machine learning algorithm 
from sklearn.tree import DecisionTreeClassifier

# create and fit our model 
model = DecisionTreeClassifier().fit(X,y)
In [ ]:
#prediction 
model.predict([[80, 1]])
C:\Users\Epazz\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\base.py:450: UserWarning: X does not have valid feature names, but DecisionTreeClassifier was fitted with feature names
  warnings.warn(
Out[ ]:
array(['Biryani'], dtype=object)
In [ ]:
model.predict([[23, 1]])[0]
C:\Users\Epazz\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\base.py:450: UserWarning: X does not have valid feature names, but DecisionTreeClassifier was fitted with feature names
  warnings.warn(
Out[ ]:
'Biryani'
In [ ]:
# how to measure the accurary of our model 
# slit data into test and train (80/20 rule)
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2 , random_state= 0)

# create a model 
model = DecisionTreeClassifier()
# fittine a model 
model.fit(X_train, y_train)

predicted_values = model.predict(X_test)
predicted_values
#checkin score 
score = accuracy_score(y_test, predicted_values)
score
Out[ ]:
0.6122448979591837
In [ ]:
import seaborn as sns 
import matplotlib.pyplot as plt 

sns.boxplot(x= X_train["weight"],hue= X_train["gender"] ,y= y_train, data= df)
Out[ ]:
<AxesSubplot:xlabel='weight', ylabel='likeness'>
In [ ]:
# HOw to train and save your model 
import pandas as pd 
from sklearn.tree import DecisionTreeClassifier
import joblib

model = DecisionTreeClassifier().fit(X,y)

joblib.dump(model, "foodie.joblib")
Out[ ]:
['foodie.joblib']
In [ ]:
#graph 
from sklearn import tree
model = DecisionTreeClassifier().fit(X,y)
In [ ]:
#graph 
from sklearn import tree
model = DecisionTreeClassifier().fit(X,y)
# model.fit(X, y)
# graphic evaluation / look into what happened 
tree.export_graphviz(model,
                     out_file="foodie.dot",
                     feature_names=["age", "gender"],
                     class_names = sorted( y.unique()),
                     label= "all",
                     rounded= True,
                     filled =True) 

Second Example¶

In [ ]:
#Load data set 
import pandas as pd 
import seaborn as sns 
import numpy as np

df1= sns.load_dataset("iris")
df1.head()
Out[ ]:
sepal_length sepal_width petal_length petal_width species
0 5.1 3.5 1.4 0.2 setosa
1 4.9 3.0 1.4 0.2 setosa
2 4.7 3.2 1.3 0.2 setosa
3 4.6 3.1 1.5 0.2 setosa
4 5.0 3.6 1.4 0.2 setosa
In [ ]:
import matplotlib.pyplot as plt 
from sklearn.tree import DecisionTreeClassifier
# df1 mtlb ye wala data ..Iloc mtb kis type ka data chahiey us me se 
#then square bracket me pehle row ati and : ka mtlb k sari rows aa gai 
# then comma , and then column ate ..hum ne kha ki : mtlb sare column aa jey -1 last wale k ilawa 
X= df1.iloc[: , :-1] 
# and yhe pe -1 likha k sirf last wala column : mtlb baki sare column chor k 
y= df1.iloc[:, -1:]
# agr hum sif -1 likhe ge to wo us column ki values show kr de ga ..but humko us column ka index b chahiey 
In [ ]:
X.head()
Out[ ]:
sepal_length sepal_width petal_length petal_width
0 5.1 3.5 1.4 0.2
1 4.9 3.0 1.4 0.2
2 4.7 3.2 1.3 0.2
3 4.6 3.1 1.5 0.2
4 5.0 3.6 1.4 0.2
In [ ]:
y.head()
Out[ ]:
species
0 setosa
1 setosa
2 setosa
3 setosa
4 setosa
In [ ]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree

model = DecisionTreeClassifier().fit(X,y)
model 
Out[ ]:
DecisionTreeClassifier()
In [ ]:
plot_tree(model , filled = True)
plt.title("Decision tree trained model of IRIS data")

# FOllowing codes are to sav
# #saving high resolution png 
# plt.savefig('saving-a-high-resolution-plot.png', dpi=300)
# #saving high resolution png with transparent
# plt.savefig('saving-a-plot-as-png-file-transparent.png',dpi=400, transparent=True)
# #saving a pdf file a with high resolution 
# plt.savefig('saving-a-plot-in-pdf.pdf', dpi=400)
# #saving a tiff file with high resolution 
# plt.savefig('saving-a-plot-as-tiff-file.tiff', dpi=500)

plt.show()

Random Forest Classifier¶

In [ ]:
# load libraires adn data set
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 

df= sns.load_dataset("iris")
df.head()
Out[ ]:
sepal_length sepal_width petal_length petal_width species
0 5.1 3.5 1.4 0.2 setosa
1 4.9 3.0 1.4 0.2 setosa
2 4.7 3.2 1.3 0.2 setosa
3 4.6 3.1 1.5 0.2 setosa
4 5.0 3.6 1.4 0.2 setosa
In [ ]:
X= df.iloc[: ,:-1]
y= df.iloc[:, -1:]
In [ ]:
from sklearn.ensemble import RandomForestClassifier
#n_estimators default 100 
# The Number of trees in forest 
model = RandomForestClassifier(n_estimators=100)
model.fit(X,y)
model.predict([[5,4,2,6]])
C:\Users\Epazz\AppData\Local\Temp/ipykernel_10720/889084137.py:5: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().
  model.fit(X,y)
C:\Users\Epazz\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\base.py:450: UserWarning: X does not have valid feature names, but RandomForestClassifier was fitted with feature names
  warnings.warn(
Out[ ]:
array(['setosa'], dtype=object)
In [ ]:
# IMporting library 
from sklearn.model_selection import train_test_split
# splitting the data 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)
y_pred= model.predict(X_test)
y_pred
Out[ ]:
array(['setosa', 'virginica', 'versicolor', 'setosa', 'setosa', 'setosa',
       'versicolor', 'setosa', 'virginica', 'versicolor', 'virginica',
       'virginica', 'virginica', 'setosa', 'virginica', 'virginica',
       'versicolor', 'versicolor', 'versicolor', 'setosa', 'setosa',
       'versicolor', 'setosa', 'versicolor', 'versicolor', 'virginica',
       'virginica', 'versicolor', 'versicolor', 'setosa', 'virginica',
       'virginica', 'virginica', 'setosa', 'versicolor', 'setosa',
       'setosa', 'versicolor'], dtype=object)
In [ ]:
# Accuracy test 
score = model.score(X_test, y_test)
score
Out[ ]:
1.0
In [ ]:
from sklearn import metrics
cm = metrics.confusion_matrix(y_test, y_pred)
cm
Out[ ]:
array([[13,  0,  0],
       [ 0, 13,  0],
       [ 0,  0, 12]], dtype=int64)
In [ ]:
#plotting a confusion metrix 
plt.figure(figsize=(10,10))
sns.heatmap(cm, annot=True, fmt=".3f", linewidths=0.5 , square= True, cmap= "Spectral")
plt.ylabel("Actual Output")
plt.xlabel("Predicted Output")
title = "Accuracy Score : {0}".format(score)
plt.title(title, size= 15)
Out[ ]:
Text(0.5, 1.0, 'Accuracy Score : 1.0')

Naive Bayes Classifier¶

There are three types of Naive Bayes model under the scikit-learn library:

  • Gaussian
  • Multinomial
  • Bernoulli

Gaussian: It is used in classification and it assumes that features follow a normal distribution.\ When the values of predictors are continuous in nature and it is assumed that they follow Gaussian distribution.

Multinomial: It is used for discrete counts.\ when the predictors are boolean in nature and it is assumed they follow Bernoulli distribution.\ For example, let’s say, we have a text classification problem. Here we can consider Bernoulli trials which is one step further and instead of “word occurring in the document”, we have “count how often word occurs in the document”, you can think of it as “number of times outcome number x_i is observed over the n trials”.

Bernoulli: The binomial model is useful if your feature vectors are binary (i.e. zeros and ones). One application would be text classification with ‘bag of words’ model where the 1s & 0s are “word occurs in the document” and “word does not occur in the document” respectively.\ mostly used for document or text classification problems

In [ ]:
# load libraires adn data set
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 
In [ ]:
# Load data set from Seaborn library 
df= sns.load_dataset("iris")
df.head()
Out[ ]:
sepal_length sepal_width petal_length petal_width species
0 5.1 3.5 1.4 0.2 setosa
1 4.9 3.0 1.4 0.2 setosa
2 4.7 3.2 1.3 0.2 setosa
3 4.6 3.1 1.5 0.2 setosa
4 5.0 3.6 1.4 0.2 setosa
In [ ]:
#Defining features and lables at X and y respectively 
X= df.iloc[: ,:-1]
y= df.iloc[:, -1:]
In [ ]:
# Checking our variables 
print(X.head())
print(y.head())
   sepal_length  sepal_width  petal_length  petal_width
0           5.1          3.5           1.4          0.2
1           4.9          3.0           1.4          0.2
2           4.7          3.2           1.3          0.2
3           4.6          3.1           1.5          0.2
4           5.0          3.6           1.4          0.2
  species
0  setosa
1  setosa
2  setosa
3  setosa
4  setosa
In [ ]:
# splitting X and y into training and testing sets at 80/20 rule 
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
In [ ]:
# training the model on training set
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(X_train, y_train)
C:\Users\Epazz\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\utils\validation.py:993: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
  y = column_or_1d(y, warn=True)
Out[ ]:
GaussianNB()
In [ ]:
# making predictions on the testing set
y_pred = gnb.predict(X_test)
In [ ]:
# comparing actual response values (y_test) with predicted response values (y_pred)
from sklearn import metrics
print("Gaussian Naive Bayes model accuracy(in %):", metrics.accuracy_score(y_test, y_pred)*100)
Gaussian Naive Bayes model accuracy(in %): 96.66666666666667

Support Vector Machine¶

In [ ]:
#Import scikit-learn dataset library
from sklearn import datasets
#Load dataset
cancer = datasets.load_breast_cancer()
In [ ]:
type(cancer)
Out[ ]:
sklearn.utils.Bunch
In [ ]:
# print the names of the 30 features
print("Features: ", cancer.feature_names)
# print the label type of cancer('malignant' "benign')
print("Labels: ", cancer.target_names)
Features:  ['mean radius' 'mean texture' 'mean perimeter' 'mean area'
 'mean smoothness' 'mean compactness' 'mean concavity'
 'mean concave points' 'mean symmetry' 'mean fractal dimension'
 'radius error' 'texture error' 'perimeter error' 'area error'
 'smoothness error' 'compactness error' 'concavity error'
 'concave points error' 'symmetry error' 'fractal dimension error'
 'worst radius' 'worst texture' 'worst perimeter' 'worst area'
 'worst smoothness' 'worst compactness' 'worst concavity'
 'worst concave points' 'worst symmetry' 'worst fractal dimension']
Labels:  ['malignant' 'benign']
In [ ]:
# Print data (features) shape 
cancer.data.shape
Out[ ]:
(569, 30)
In [ ]:
#print the cancer data features (top 5 records)
print(cancer.data[0:5])
[[1.799e+01 1.038e+01 1.228e+02 1.001e+03 1.184e-01 2.776e-01 3.001e-01
  1.471e-01 2.419e-01 7.871e-02 1.095e+00 9.053e-01 8.589e+00 1.534e+02
  6.399e-03 4.904e-02 5.373e-02 1.587e-02 3.003e-02 6.193e-03 2.538e+01
  1.733e+01 1.846e+02 2.019e+03 1.622e-01 6.656e-01 7.119e-01 2.654e-01
  4.601e-01 1.189e-01]
 [2.057e+01 1.777e+01 1.329e+02 1.326e+03 8.474e-02 7.864e-02 8.690e-02
  7.017e-02 1.812e-01 5.667e-02 5.435e-01 7.339e-01 3.398e+00 7.408e+01
  5.225e-03 1.308e-02 1.860e-02 1.340e-02 1.389e-02 3.532e-03 2.499e+01
  2.341e+01 1.588e+02 1.956e+03 1.238e-01 1.866e-01 2.416e-01 1.860e-01
  2.750e-01 8.902e-02]
 [1.969e+01 2.125e+01 1.300e+02 1.203e+03 1.096e-01 1.599e-01 1.974e-01
  1.279e-01 2.069e-01 5.999e-02 7.456e-01 7.869e-01 4.585e+00 9.403e+01
  6.150e-03 4.006e-02 3.832e-02 2.058e-02 2.250e-02 4.571e-03 2.357e+01
  2.553e+01 1.525e+02 1.709e+03 1.444e-01 4.245e-01 4.504e-01 2.430e-01
  3.613e-01 8.758e-02]
 [1.142e+01 2.038e+01 7.758e+01 3.861e+02 1.425e-01 2.839e-01 2.414e-01
  1.052e-01 2.597e-01 9.744e-02 4.956e-01 1.156e+00 3.445e+00 2.723e+01
  9.110e-03 7.458e-02 5.661e-02 1.867e-02 5.963e-02 9.208e-03 1.491e+01
  2.650e+01 9.887e+01 5.677e+02 2.098e-01 8.663e-01 6.869e-01 2.575e-01
  6.638e-01 1.730e-01]
 [2.029e+01 1.434e+01 1.351e+02 1.297e+03 1.003e-01 1.328e-01 1.980e-01
  1.043e-01 1.809e-01 5.883e-02 7.572e-01 7.813e-01 5.438e+00 9.444e+01
  1.149e-02 2.461e-02 5.688e-02 1.885e-02 1.756e-02 5.115e-03 2.254e+01
  1.667e+01 1.522e+02 1.575e+03 1.374e-01 2.050e-01 4.000e-01 1.625e-01
  2.364e-01 7.678e-02]]
In [ ]:
# print the cancer labels (0:malignant, 1:benign)
print(cancer.target)
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 1 0 0 0 0 0 0 0 0 1 0 1 1 1 1 1 0 0 1 0 0 1 1 1 1 0 1 0 0 1 1 1 1 0 1 0 0
 1 0 1 0 0 1 1 1 0 0 1 0 0 0 1 1 1 0 1 1 0 0 1 1 1 0 0 1 1 1 1 0 1 1 0 1 1
 1 1 1 1 1 1 0 0 0 1 0 0 1 1 1 0 0 1 0 1 0 0 1 0 0 1 1 0 1 1 0 1 1 1 1 0 1
 1 1 1 1 1 1 1 1 0 1 1 1 1 0 0 1 0 1 1 0 0 1 1 0 0 1 1 1 1 0 1 1 0 0 0 1 0
 1 0 1 1 1 0 1 1 0 0 1 0 0 0 0 1 0 0 0 1 0 1 0 1 1 0 1 0 0 0 0 1 1 0 0 1 1
 1 0 1 1 1 1 1 0 0 1 1 0 1 1 0 0 1 0 1 1 1 1 0 1 1 1 1 1 0 1 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 1 1 1 1 1 1 0 1 0 1 1 0 1 1 0 1 0 0 1 1 1 1 1 1 1 1 1 1 1 1
 1 0 1 1 0 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 0 1 0 1 1 1 1 0 0 0 1 1
 1 1 0 1 0 1 0 1 1 1 0 1 1 1 1 1 1 1 0 0 0 1 1 1 1 1 1 1 1 1 1 1 0 0 1 0 0
 0 1 0 0 1 1 1 1 1 0 1 1 1 1 1 0 1 1 1 0 1 1 0 0 1 1 1 1 1 1 0 1 1 1 1 1 1
 1 0 1 1 1 1 1 0 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 0 1 0 0 1 0 1 1 1 1 1 0 1 1
 0 1 0 1 1 0 1 0 1 1 1 1 1 1 1 1 0 0 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 0 1
 1 1 1 1 1 1 0 1 0 1 1 0 1 1 1 1 1 0 0 1 0 1 0 1 1 1 1 1 0 1 1 0 1 0 1 0 0
 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 0 1 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 0 0 0 0 0 0 1]
In [ ]:
# Import train_test_split function
from sklearn.model_selection import train_test_split
# Split dataset into training set and test set
X_train, X_test, y_train, y_test = train_test_split(cancer.data, cancer.target, test_size=0.2, random_state=0)
In [ ]:
#Import svm model
from sklearn import svm
#Create a svm Classifier
clf = svm. SVC(kernel='linear') # Linear Kernel
#Train the model using the training sets
clf.fit(X_train, y_train)
#Predict the response for test dataset
y_pred = clf.predict(X_test)
In [ ]:
#Import scikit-learn metrics module for accuracy calculation
from sklearn import metrics
score = metrics.accuracy_score(y_test, y_pred)
# Model Accuracy: how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
Accuracy: 0.956140350877193
In [ ]:
# Model Precision: what percentage of positive tuples are labeled as such?
print("Precision:", metrics.precision_score(y_test, y_pred))
# Model Recall: what percentage of positive tuples are labelled as such?
print("Recall:", metrics.recall_score(y_test, y_pred))
Precision: 0.984375
Recall: 0.9402985074626866
In [ ]:
# confusion matrix
from sklearn import metrics
cm = metrics.confusion_matrix(y_test, y_pred)
cm
Out[ ]:
array([[46,  1],
       [ 4, 63]], dtype=int64)
In [ ]:
import seaborn as sns
import matplotlib.pyplot as plt
plt.figure(figsize=(12,12))
sns.heatmap(cm, annot=True, fmt=".3f", linewidths=.5, square = True, cmap = 'Spectral')
plt.ylabel('Actual label')
plt.xlabel('Predicted label')
all_sample_title = 'SVM model accuracy(in %): {0}'.format(score*100)
plt.title(all_sample_title, size = 15)
Out[ ]:
Text(0.5, 1.0, 'SVM model accuracy(in %): 95.6140350877193')

Unsupervised MAchine Learning¶

  • learning without teacher
  • we dont have any labels here only data
  • we then ahave o specify the data by own

Clustring¶

  • Grouping of individuals based on their common characteristics
  • also known as segmentation
  • We dont have labels here , so we will define labels in clusters on basis of available features

Clusters: A group of object that are similar to other subjects in the clusters adn dis-similar to other data point in other clusters

Clustering Classification
Un Supervised Supervised
No training data Labeled data
Unalbeled data Labeled data
Define labels using data then in algorithm Data>Model>Training>Classification

Clustring Algorithms¶

  • Partition based Clustring \ Relatively speed is high
    • K means
    • k mediod
    • Fuzzy c-means
  • Hirarical clustring (tree wali)
    • Agglomarative
    • Divisive
  • Density Based Clustring\ Arbitrary shaped clusters

K-Means Clustering¶

It is the simplest and commonly used iterative type unsupervised learning algorithm. In this, we randomly initialize the K number of centroids in the data and iterates these centroids until no change happens to the position of the centroid. Let’s go through the steps involved in K means clustering for a better understanding.

1) Select the number of clusters for the dataset ( K )

2) Select K number of centroids

3) By calculating the Euclidean distance or Manhattan distance assign the points to the nearest centroid, thus creating K groups

4) Now find the original centroid in each group

5) Again reassign the whole data point based on this new centroid, then repeat step 4 until the position of the centroid doesn’t change.

Explained in Example 3

Example 1¶

In [ ]:
#importing libraries
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
import numpy as np
from sklearn.cluster import KMeans
import sklearn.datasets._samples_generator
In [ ]:
#The code below will build a 2D dataset with four blobs.
from sklearn.datasets._samples_generator import make_blobs
X, y_true = make_blobs(n_samples=400, centers=4, cluster_std=0.60, random_state=0)
In [ ]:
#visualizing the dataset.
plt.scatter(X[:, 0], X[:, 1], s=10); # S is size of dots in data we are visualizing 
plt.show()
In [ ]:
#create a K – means object while specifying the number of clusters, train the model, and estimate as
kmeans = KMeans(n_clusters=4)
kmeans.fit(X)
y_kmeans = kmeans.predict(X)
In [ ]:
#plot and visualize the cluster’s centers as determined by the k-means Python estimator
plt.scatter(X[:, 0], X[:, 1], c=y_kmeans, s=20, cmap='summer')
centers = kmeans.cluster_centers_
plt.scatter(centers[:, 0], centers[:, 1], c='blue', s=100, alpha=0.9);
plt.show()

Example 2¶

we will use K-means clustering on a simple digit’s dataset. Without relying on the original label information, K-means will try to identify numbers that are similar.

In [ ]:
# Importing libraries 
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
import numpy as np
from sklearn.cluster import KMeans
In [ ]:
#Load the digit dataset from sklearn and create an object out of it.
#Additionally, we can get the total number of rows and the total number of columns in this dataset

from sklearn.datasets import load_digits
digits = load_digits()
digits.data.shape
Out[ ]:
(1797, 64)
In [ ]:
#We may cluster the data in the same way that we did in Example 1 above
kmeans = KMeans(n_clusters=10, random_state=0)
clusters = kmeans.fit_predict(digits.data)
# Checking how many clusters are created by K-means and how many feature does we have 
kmeans.cluster_centers_.shape
#indicates that K-means generated 10 clusters with 64 features
Out[ ]:
(10, 64)
In [ ]:
# Checking the centers of cluster 
fig, ax = plt.subplots(2, 5, figsize=(8, 3))
centers = kmeans.cluster_centers_.reshape(10, 8, 8)
for axi, center in zip(ax.flat, centers):
   axi.set(xticks=[], yticks=[])
   axi.imshow(center, interpolation='nearest', cmap=plt.cm.binary)

#As a result, we will receive the picture below, which shows clusters centers learned by k-means.
In [ ]:
# Checking the learned cluster labels with the actual labels
from scipy.stats import mode
labels = np.zeros_like(clusters)
for i in range(10):
   mask = (clusters == i)
   labels[mask] = mode(digits.target[mask])[0]

# Following that, we can check the accuracy as follows:
from sklearn.metrics import accuracy_score
accuracy_score(digits.target, labels)
Out[ ]:
0.7935447968836951

Example 3¶

In [ ]:
# Importing libraries 
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd 
import sklearn
In [ ]:
#import the dataset and slice the important features
dataset = pd.read_csv('Mall_Customers.csv')
dataset.head()
Out[ ]:
CustomerID Genre Age Annual Income (k$) Spending Score (1-100)
0 1 Male 19 15 39
1 2 Male 21 15 81
2 3 Female 20 16 6
3 4 Female 23 16 77
4 5 Female 31 17 40
In [ ]:
X = dataset.iloc[:, [3, 4]].values
X
Out[ ]:
array([[ 15,  39],
       [ 15,  81],
       [ 16,   6],
       [ 16,  77],
       [ 17,  40],
       [ 17,  76],
       [ 18,   6],
       [ 18,  94],
       [ 19,   3],
       [ 19,  72],
       [ 19,  14],
       [ 19,  99],
       [ 20,  15],
       [ 20,  77],
       [ 20,  13],
       [ 20,  79],
       [ 21,  35],
       [ 21,  66],
       [ 23,  29],
       [ 23,  98],
       [ 24,  35],
       [ 24,  73],
       [ 25,   5],
       [ 25,  73],
       [ 28,  14],
       [ 28,  82],
       [ 28,  32],
       [ 28,  61],
       [ 29,  31],
       [ 29,  87],
       [ 30,   4],
       [ 30,  73],
       [ 33,   4],
       [ 33,  92],
       [ 33,  14],
       [ 33,  81],
       [ 34,  17],
       [ 34,  73],
       [ 37,  26],
       [ 37,  75],
       [ 38,  35],
       [ 38,  92],
       [ 39,  36],
       [ 39,  61],
       [ 39,  28],
       [ 39,  65],
       [ 40,  55],
       [ 40,  47],
       [ 40,  42],
       [ 40,  42],
       [ 42,  52],
       [ 42,  60],
       [ 43,  54],
       [ 43,  60],
       [ 43,  45],
       [ 43,  41],
       [ 44,  50],
       [ 44,  46],
       [ 46,  51],
       [ 46,  46],
       [ 46,  56],
       [ 46,  55],
       [ 47,  52],
       [ 47,  59],
       [ 48,  51],
       [ 48,  59],
       [ 48,  50],
       [ 48,  48],
       [ 48,  59],
       [ 48,  47],
       [ 49,  55],
       [ 49,  42],
       [ 50,  49],
       [ 50,  56],
       [ 54,  47],
       [ 54,  54],
       [ 54,  53],
       [ 54,  48],
       [ 54,  52],
       [ 54,  42],
       [ 54,  51],
       [ 54,  55],
       [ 54,  41],
       [ 54,  44],
       [ 54,  57],
       [ 54,  46],
       [ 57,  58],
       [ 57,  55],
       [ 58,  60],
       [ 58,  46],
       [ 59,  55],
       [ 59,  41],
       [ 60,  49],
       [ 60,  40],
       [ 60,  42],
       [ 60,  52],
       [ 60,  47],
       [ 60,  50],
       [ 61,  42],
       [ 61,  49],
       [ 62,  41],
       [ 62,  48],
       [ 62,  59],
       [ 62,  55],
       [ 62,  56],
       [ 62,  42],
       [ 63,  50],
       [ 63,  46],
       [ 63,  43],
       [ 63,  48],
       [ 63,  52],
       [ 63,  54],
       [ 64,  42],
       [ 64,  46],
       [ 65,  48],
       [ 65,  50],
       [ 65,  43],
       [ 65,  59],
       [ 67,  43],
       [ 67,  57],
       [ 67,  56],
       [ 67,  40],
       [ 69,  58],
       [ 69,  91],
       [ 70,  29],
       [ 70,  77],
       [ 71,  35],
       [ 71,  95],
       [ 71,  11],
       [ 71,  75],
       [ 71,   9],
       [ 71,  75],
       [ 72,  34],
       [ 72,  71],
       [ 73,   5],
       [ 73,  88],
       [ 73,   7],
       [ 73,  73],
       [ 74,  10],
       [ 74,  72],
       [ 75,   5],
       [ 75,  93],
       [ 76,  40],
       [ 76,  87],
       [ 77,  12],
       [ 77,  97],
       [ 77,  36],
       [ 77,  74],
       [ 78,  22],
       [ 78,  90],
       [ 78,  17],
       [ 78,  88],
       [ 78,  20],
       [ 78,  76],
       [ 78,  16],
       [ 78,  89],
       [ 78,   1],
       [ 78,  78],
       [ 78,   1],
       [ 78,  73],
       [ 79,  35],
       [ 79,  83],
       [ 81,   5],
       [ 81,  93],
       [ 85,  26],
       [ 85,  75],
       [ 86,  20],
       [ 86,  95],
       [ 87,  27],
       [ 87,  63],
       [ 87,  13],
       [ 87,  75],
       [ 87,  10],
       [ 87,  92],
       [ 88,  13],
       [ 88,  86],
       [ 88,  15],
       [ 88,  69],
       [ 93,  14],
       [ 93,  90],
       [ 97,  32],
       [ 97,  86],
       [ 98,  15],
       [ 98,  88],
       [ 99,  39],
       [ 99,  97],
       [101,  24],
       [101,  68],
       [103,  17],
       [103,  85],
       [103,  23],
       [103,  69],
       [113,   8],
       [113,  91],
       [120,  16],
       [120,  79],
       [126,  28],
       [126,  74],
       [137,  18],
       [137,  83]], dtype=int64)

Elbow Method¶

n the Elbow method, we are actually varying the number of clusters ( K ) from 1 – 10. For each value of K, we are calculating WCSS ( Within-Cluster Sum of Square ). WCSS is the sum of squared distance between each point and the centroid in a cluster. When we plot the WCSS with the K value, the plot looks like an Elbow. As the number of clusters increases, the WCSS value will start to decrease. WCSS value is largest when K = 1. When we analyze the graph we can see that the graph will rapidly change at a point and thus creating an elbow shape. From this point, the graph starts to move almost parallel to the X-axis. The K value corresponding to this point is the optimal K value or an optimal number of clusters.

In [ ]:
#find the optimal K value for clustering the data.
#Now we are using the Elbow method to find the optimal K value.
from sklearn.cluster import KMeans
wcss = [] 
for i in range(1, 11): 
    kmeans = KMeans(n_clusters = i, init = 'k-means++', random_state = 42)
    #“init” argument is the method for initializing the centroid

    kmeans.fit(X) 
    wcss.append(kmeans.inertia_)
    #We calculated the WCSS value for each K value. 
In [ ]:
# Now we have to plot the WCSS with K value
plt.plot(range(1, 11), wcss)
plt.xlabel('Number of clusters')
plt.ylabel('WCSS') 
plt.show()
#The point at which the elbow shape is created is 5, that is, our K value or an optimal number of clusters is 5.
In [ ]:
#Now let’s train the model on the dataset with a number of clusters 5.
kmeans = KMeans(n_clusters = 5, init = "k-means++", random_state = 42)
y_kmeans = kmeans.fit_predict(X)
y_kmeans
#y_kmeans give us different clusters corresponding to X.
Out[ ]:
array([2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3,
       2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 0,
       2, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 1, 4, 0, 4, 1, 4, 1, 4,
       0, 4, 1, 4, 1, 4, 1, 4, 1, 4, 0, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4,
       1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4,
       1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4,
       1, 4])
In [ ]:
#Now let’s plot all the clusters using matplotlib.
plt.scatter(X[y_kmeans == 0, 0], X[y_kmeans == 0, 1], s = 60, c = 'red', label = 'Cluster1')
plt.scatter(X[y_kmeans == 1, 0], X[y_kmeans == 1, 1], s = 60, c = 'blue', label = 'Cluster2')
plt.scatter(X[y_kmeans == 2, 0], X[y_kmeans == 2, 1], s = 60, c = 'green', label = 'Cluster3')
plt.scatter(X[y_kmeans == 3, 0], X[y_kmeans == 3, 1], s = 60, c = 'violet', label = 'Cluster4')
plt.scatter(X[y_kmeans == 4, 0], X[y_kmeans == 4, 1], s = 60, c = 'yellow', label = 'Cluster5') 
plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1], s = 100, c = 'black', label = 'Centroids')
plt.xlabel('Annual Income (k$)')
plt.ylabel('Spending Score (1-100)')
plt.legend() 

plt.show()